Merged
694 commits
76a841f
Port OpSchema.__post_init__ and OpSchema._recompute_comparison_key to…
swolchok Sep 18, 2025
46c647d
[vllm hash update] update the pinned vllm hash (#163304)
pytorchupdatebot Sep 19, 2025
3016616
[BE] Update Python min version to 3.10 (#162310)
malfet Sep 19, 2025
c91f59b
Fix performance regression when indexing by Numpy arrays (#163280)
ezyang Sep 18, 2025
ce5637b
Fix invalid indices bug for max_unpool2d/3d on MPS (#163036)
can-gaa-hou Sep 19, 2025
5780478
Revert "[BE] Update Python min version to 3.10 (#162310)"
pytorchmergebot Sep 19, 2025
1708120
Revert "[CI] Move Windows build/tests to Python-3.10 (#162862)"
pytorchmergebot Sep 19, 2025
e0bcd58
[MTIA] Add MTIA dispatch for kernel foreach_maximum(Add D80022242 bac…
DoubleBiao Sep 19, 2025
1302637
Revert "[dynamo][guards] Do not construct entire framelocals dict for…
pytorchmergebot Sep 19, 2025
32ad29b
Revert "[dynamo][guards] Fail on an unknown framelocals to dict conve…
pytorchmergebot Sep 19, 2025
0815091
[CP][BE] Cosmetic refactors for CP code base (#163115)
fegin Sep 18, 2025
ab5086a
[WOQ] Add XPU kernel for _weight_int8pack_mm (#160938)
xiaowangintel Sep 19, 2025
33e6c5a
[Dependabot] Update(deps): Bump transformers from 4.54.0 to 4.56.0 in…
dependabot[bot] Sep 19, 2025
bee362c
[ROCm][SymmMem] Fix skip condition for PLATFORM_SUPPORTS_SYMM_MEM (#1…
pragupta Sep 19, 2025
264e7f6
[ROCm] Fix mx fp8 and fp4 code after scaling refactor changes. (#163127)
jagadish-amd Sep 19, 2025
f8f230a
[FP8][cuBLAS][H100] only test fp32 outputs for rowwise `_scaled_mm` o…
eqy Sep 19, 2025
e631d76
[Flex] Changing how bwd configs are setup and updating default b200 c…
drisspg Sep 19, 2025
4967ad8
[Graph Partition] improve custom op output alias (#163227)
BoyuanFeng Sep 19, 2025
3e663ce
[Inductor][Triton][FP8] Add a Blackwell-specific scaled persistent + …
jananisriram Sep 19, 2025
2984bfe
[ez][CI] Run vllm workflow on vllm pin updates (#163353)
clee2000 Sep 19, 2025
a3b68c7
Revert "Fix boxcox to return same result for same input in one batch …
pytorchmergebot Sep 19, 2025
607469b
Revert "[ROCm] Bump FBGEMM commit to avoid CK errors (#162590)"
pytorchmergebot Sep 19, 2025
a0d2d84
Handling overflow for long int overflow for the product of kernel_hei…
arkadip-maitra Sep 19, 2025
b8c5ec5
[CD] Simplify NVIDIA driver installation step (#163349)
malfet Sep 19, 2025
52dd7a8
Move ROCM trunk wheel builds to 3.10 (#163339)
malfet Sep 19, 2025
03f34fd
Add explicit typing to nn.Module.__init__() parameters (#157389)
dsashidh Sep 19, 2025
bc7b17a
Realize LazyVariableTracker before raising exception (#163350)
guilhermeleobas Sep 19, 2025
979e10f
[Bugfix] Match eager stride semantics for cloned tensors with preserv…
Lucaskabela Sep 19, 2025
a273475
[BE] Introduce `CONDA_ROOT_DIR` (#163341)
malfet Sep 19, 2025
4a160da
[CUDA] revert PR 130472 (#162950)
thenumberouscode Sep 19, 2025
2a308c7
Revert "Improve device info with new flops and bandwidth formula base…
pytorchmergebot Sep 19, 2025
f8fb437
[SymmMem] Barrier on team instead of world (#163298)
kwen2501 Sep 18, 2025
7130b17
[SymmMem] Fix memory allocation hold-up (#162680)
kwen2501 Sep 18, 2025
ba3c2c8
SDP Backend function fix (#161169)
ahkush Sep 19, 2025
466122b
[inductor] avoid creating LoopBody twice (#162101)
shunting314 Sep 11, 2025
e88460f
[Inductor] don't call sympy_str when not needed (#162126)
shunting314 Sep 11, 2025
248156e
[Inductor] do loop reordering in a separate final round (#162355)
shunting314 Sep 11, 2025
df9a482
Bugfix for doing negative padding (#161639)
skpark-rh Sep 19, 2025
9f8a311
[Inductor][Intel GPU] Save `threads_per_warp` from tirton compiled ke…
etaf Sep 19, 2025
fab8455
Don't use declarations in global namespace in stable headers (#163352)
mikaylagawarecki Sep 19, 2025
e6a9db5
Add analytics ID to cpp docs (#163370)
svekars Sep 19, 2025
9b5ec0f
Use computed buffer sizes of torch for cusparseLt metadata (#163125)
aartbik Sep 19, 2025
0098e56
[CI] Move Windows build/tests to Python-3.10 (#162862)
malfet Sep 19, 2025
ee7bdd8
[graph partition] Add way to register custom rule (#163310)
zou3519 Sep 19, 2025
093f064
[CP][BE] Correct an incorrect docstring (#163131)
fegin Sep 18, 2025
8225a26
[dynamo] Fix issue with namedtuple slicing (#163351)
jansel Sep 19, 2025
bfe9e60
Simplify PrecompileContext to no longer be a CacheArtifactManager (#1…
jamesjwu Sep 20, 2025
a1df0b4
Lazy import to avoid circular import issue for DebugMode (#163381)
SherlockNoMad Sep 20, 2025
a31acf3
Clean up obsoleted vLLM tests (#163383)
huydhn Sep 20, 2025
e56dd5d
[Inductor-FX] Support torch.cond (#163234)
blaine-rister Sep 20, 2025
a87aea0
Update RandomSampler docstring. data_source must be Sized not Dataset…
dsashidh Sep 20, 2025
0b5a99b
remove duplicate import for defaultdict (#160519)
parsshar-RH Sep 20, 2025
df5d6d5
[inductor][triton heuristics] move allow tf32 out of config params (#…
coconutruben Sep 20, 2025
0ee331b
[inductor][choices] move extra kwargs out of get_template_configs (#1…
coconutruben Sep 20, 2025
d55c9d5
[CP] Fix cuDNN CP LSE dimension bug (#163231)
fegin Sep 18, 2025
5050cfa
[Opitmus] fix fp8 activation quatization for duplicates forward outpu…
mengluy0125 Sep 20, 2025
eb11d17
[Caffe2] Improve SVE batch box cox by 2% (#163360)
Nicoshev Sep 20, 2025
f9074c7
[STABLE ABI] Add copy_ operation. (#161895)
pearu Sep 19, 2025
d70c0ba
minimize graph capture output (#162211)
avikchaudhuri Sep 20, 2025
3938175
[1/n] Support cpu_tensor.to("cuda:0") in FakeTensorMode on cuda-less …
SherlockNoMad Sep 20, 2025
9e3725e
make fullgraph_capture work on mod, args, kwargs (#162849)
avikchaudhuri Sep 20, 2025
8e3fd3d
[AI Codemod][DevmatePerfOptimizationVectorReallocation] fbcode/caffe2…
yfeldblum Sep 20, 2025
e37b600
[CUDA][cuBLAS][FP8] Forward-fix #162022 (#163354)
eqy Sep 21, 2025
2887f3f
[BE] Slight improvements to documentation in python_dispatch (#162963)
ezyang Sep 19, 2025
97eb7a2
torchdim Python port (#160236)
ezyang Sep 20, 2025
5b386ee
[vllm hash update] update the pinned vllm hash (#163392)
pytorchupdatebot Sep 21, 2025
1ca9445
[BE][Ez]: Prevent copies of std::vector in CUDA ForeachOps (#163416)
Skylion007 Sep 21, 2025
f591bb5
Remove data_source argument from Sampler (#163134)
cyyever Sep 21, 2025
4a96a6f
[Docs] Fix indentations in cond.md (#156147)
windsonsea Sep 21, 2025
1faf636
Delete functorch C extension entirely. (#163340)
ezyang Sep 21, 2025
9ba9180
Add api info for torch._C._nn.pyi (#162707)
orangeH25 Sep 21, 2025
d8cbbc0
[Easy][AMP] Refactor the AMP logic for getting dtype (#162796)
fffrog Sep 12, 2025
5d8a226
[SymmMem] Promote `@requires_nvshmem` instead of `enable_triton` (#16…
kwen2501 Sep 21, 2025
f34744d
[inductor] bugfix: keep WeakDeps (WAR deps) during fusion (#162316)
v0i0 Sep 19, 2025
51152ef
Remove autograd code for Python < 3.9 (#163313)
cyyever Sep 21, 2025
5599f48
Fully native DTensor.__new__ (#162508)
swolchok Sep 18, 2025
4d3d32f
Add torchfuzz initial impl. (#163417)
laithsakka Sep 20, 2025
8b14f43
[torch] DRY a couple of lines in unpickler (#163447)
yfeldblum Sep 21, 2025
6ac2b3a
[BE] Adding aliases for CUDA and XPU API documentation (#162984)
jiannanWang Sep 21, 2025
8a281d7
[submodule] Bump libfmt to 12.0.0 (#163441)
cyyever Sep 21, 2025
0b59492
[export] Fix wrap_with_set_grad_enabled retracing (#163295)
angelayi Sep 21, 2025
01f927e
Remove workarounds for Python 3.6 (#163440)
cyyever Sep 22, 2025
281bb56
Enable half precision types on test_conv_cudnn_nhwc_support (#163444)
cyyever Sep 22, 2025
3a7db34
Revert "[SymmMem] Promote `@requires_nvshmem` instead of `enable_trit…
pytorchmergebot Sep 22, 2025
f007894
Revert "[RELAND] Always build USE_DISTRIBUTED (#160449) and Make dist…
pytorchmergebot Sep 22, 2025
ae5be03
Revert "Delete functorch C extension entirely. (#163340)"
pytorchmergebot Sep 22, 2025
edafc90
Revert "[BE] Make PyObjectSlot use a global PyInterpreter (#162659)"
pytorchmergebot Sep 22, 2025
96a3afb
Simplify BFLOAT16_AVAILABLE (#163445)
cyyever Sep 22, 2025
60b4791
[MPS] Fix compile linalg inv (#163452)
Isalia20 Sep 22, 2025
9f5a644
[BE] Update Python min version to 3.10 (#162310)
malfet Sep 22, 2025
10adeb9
Revert "[BE] Update Python min version to 3.10 (#162310)"
pytorchmergebot Sep 22, 2025
509c4e8
Update cutlass version for fbcode (#163091)
henrylhtsang Sep 19, 2025
eaac218
[ROCm] Fix environment variable AOTRITON_INSTALLED_PREFIX (#163373)
xinyazhang Sep 22, 2025
e310cc5
Update fbgemm submodule (#163411)
cthi Sep 22, 2025
9ca183e
switch from stack based to graph based aproach (#163459)
laithsakka Sep 22, 2025
06fe5b9
[AOTI] fix TestAOTInductorPackage temp file locked handler. (#163499)
xuhancn Sep 22, 2025
5e7be98
[BE] Update Python min version to 3.10 (#162310)
malfet Sep 22, 2025
281f8f4
Combine strong and weak refcounts in intrusive_ptr in a single refcou…
mcfi Sep 22, 2025
d279a6a
ci: Add a way to lint all files in a PR from label (#163525)
seemethere Sep 22, 2025
bec967e
Remove C++ and test branches for CUDA<12 (#163443)
cyyever Sep 22, 2025
3be9c86
[opaque obj] Initial OpaqueObject (#162660)
angelayi Sep 22, 2025
dd30667
[opaque_obj] Add set_payload + docs (#163276)
angelayi Sep 22, 2025
4941719
Enable logging for absolute memory estimation (#158799)
basilwong Sep 22, 2025
7e97811
Fix lint (#163542)
angelayi Sep 22, 2025
1818c36
[Fix] Restrict stride normalization to 1D tensors on export (#163282)
Kathryn-cat Sep 22, 2025
eaa613b
Revert "[opaque_obj] Add set_payload + docs (#163276)"
pytorchmergebot Sep 22, 2025
bf28990
Add support for NestedTensor share_memory_ (#162272)
adabeyta Sep 22, 2025
d150484
[opaque_obj] Add set_payload + docs (#163276)
angelayi Sep 22, 2025
6f9aef5
[2/n] Support module.to("cuda:0") in FakeTensorMode on cuda-less mach…
SherlockNoMad Sep 22, 2025
d008670
[triton] update 3.5 pin to bbb06c0334a6772b92d24bde54956e675c8c6604 (…
davidberard98 Sep 19, 2025
fd785b1
Add NestedTensor dispatch for _is_any_true/_is_all_true (#162096)
adabeyta Sep 22, 2025
e065d35
[BE]: Add a few more missing move from return indices (#163456)
Skylion007 Sep 22, 2025
46e1b7d
remove allow-untyped-defs from ./torch/utils/data/datapipes/iter/file…
bobrenjc93 Sep 22, 2025
cf28ab2
remove allow-untyped-defs from ./torch/ao/quantization/pt2e/duplicate…
bobrenjc93 Sep 22, 2025
02da475
Triton template IMA reads on B200 (#163460)
drisspg Sep 22, 2025
8abc2af
[STABLE ABI] Add clone method to torch::stable::Tensor (#161896)
pearu Sep 22, 2025
8e62d01
Add dynamic shapes doc (#159428)
svekars Sep 22, 2025
4027e97
[BE] Delete `skipIfMPSOnMacOS13` (#163515)
malfet Sep 22, 2025
09cb34c
[RELAND] Always build USE_DISTRIBUTED (#160449) and Make distributed …
ezyang Sep 22, 2025
e558f7a
[vllm hash update] update the pinned vllm hash (#163463)
pytorchupdatebot Sep 22, 2025
da05aa7
[BE] Use `output_t` directly (#163518)
malfet Sep 22, 2025
0256f91
[BUG] MaxUnpool2d/3d should check output dim before accessing its ele…
can-gaa-hou Sep 22, 2025
2b03663
Allow add_persistent_r_block to scale up rblock up to a limit (#162296)
PaulZhang12 Sep 17, 2025
7ea8998
Better decomp for torch.eye (#163386)
jansel Sep 22, 2025
36c2a13
[inductor] Fix bug where viewed outputs get padded (#163398)
jansel Sep 22, 2025
a1bd924
[inductor] Fallback on strided complex add (#163387)
jansel Sep 22, 2025
c8fd2b4
[inductor] Skip test_baddmm on XPU (#163414)
jansel Sep 22, 2025
4fc271e
[inductor] Don't require_dense for grid_sampler_2d_backward (#163415)
jansel Sep 22, 2025
e0cbab4
[Inductor] avoid CUDA__equal when constant tensors are from different…
cp2923 Sep 22, 2025
b756b58
Improve fake tensor leakage detection in export by not relying on gc …
tugsbayasgalan Sep 22, 2025
60c2bde
Replace Literal[None] with None in typing (#163489)
cyyever Sep 22, 2025
33daaad
dynamo: Handle objects in graph that do not support weakref (#163168)
c00w Sep 17, 2025
fa15fb0
[EZ] Remove XLA from unstable.yml (#163564)
malfet Sep 22, 2025
8da0086
Remove outdated commented CMake code (#163442)
cyyever Sep 22, 2025
68e75be
Update pytorch_sphinx_theme2 to latest hash (#163269)
svekars Sep 22, 2025
539e84e
[precompile] Add option to disable guard check on aot-compiled functi…
zhxchen17 Sep 23, 2025
3ef1bef
[sdpa] make sure to recompile if alignment is different than before (…
ColinPeppler Sep 19, 2025
2c7959e
[ignore][codex-test] Add typing to simple library registry (#161367)
bobrenjc93 Sep 23, 2025
8f30a8d
[AOTInductor] Add grid information for Triton Kernels (#160131)
muchulee8 Sep 22, 2025
e9300b2
remove allow-untyped-defs from ./torch/onnx/_internal/torchscript_exp…
bobrenjc93 Sep 22, 2025
6a48f57
[1/N] Remove 'type: ignore' suppressions (#163468)
cyyever Sep 23, 2025
447b8fc
[2/N] Use filesystem in inductor (#163465)
cyyever Sep 23, 2025
27164b6
Add fake_impl for _native_multi_head_attention (#163167)
ydwu4 Sep 23, 2025
0b75a16
[torchfuzz] Encapsulate fuzzing and codegen logic into ops (#163547)
bobrenjc93 Sep 22, 2025
95ac7d7
Rename to _debug_mode.py to make it private (#163534)
SherlockNoMad Sep 23, 2025
fcd79d5
[vllm hash update] update the pinned vllm hash (#163590)
pytorchupdatebot Sep 23, 2025
0e12238
[torchfuzz] remove supports_variable_inputs for now (#163553)
bobrenjc93 Sep 22, 2025
bb5be56
[torch][cuda][device_limits] Library for querying device hardware lim…
valentinandrei Sep 23, 2025
e3b392b
[BC breaking] Remove deprecated imports for torch.utils.data.datapipe…
cyyever Sep 23, 2025
d3a1345
Use functools.cache on has_efa (#163439)
cyyever Sep 23, 2025
19b754d
Revert "Update cutlass version for fbcode (#163091)"
pytorchmergebot Sep 23, 2025
08c5efd
[torchfuzz] cache operators (#163554)
bobrenjc93 Sep 22, 2025
d5e51d3
[torchfuzz] decompose -> fuzz_inputs_specs (#163555)
bobrenjc93 Sep 22, 2025
1545bb1
[torchfuzz] shuffle compatible ops (#163556)
bobrenjc93 Sep 22, 2025
309fe03
[torchfuzz] remove unneeded try catch (#163557)
bobrenjc93 Sep 22, 2025
45d9dcc
Update Kineto Submodule (#162222)
sraikund16 Sep 23, 2025
375f3e3
[OpenReg][Docs] Correct docs about `openreg` usage example. (#163235)
KarhouTam Sep 23, 2025
b426ba1
[torchfuzz] introduce tensor and scalar pointwise ops (#163558)
bobrenjc93 Sep 22, 2025
8d81564
[pt2][cache] rework cache for true generic usage + better tests (#163…
nmacchioni Sep 23, 2025
5d749ce
Remove test conditions for CUDA<12 (#163495)
cyyever Sep 23, 2025
3c64b2a
CUDA 13.0 Warning update for supported architectures (#163585)
atalman Sep 23, 2025
bda9ab2
[inductor] fix as_strided lowering with .view(dtype) inputs (#163319)
xmfan Sep 22, 2025
1a42656
[Flex attention] Fix flex attention head broadcast (#163426)
Isalia20 Sep 23, 2025
aff76c0
Revert "Add fake_impl for _native_multi_head_attention (#163167)"
pytorchmergebot Sep 23, 2025
e05c9c0
[ROCm][CI] cudagraph trees ut fixes (#163592)
jeffdaily Sep 23, 2025
4264fd3
Add basic tests for torch.distributed.tensor._utils.compute_global_te…
swolchok Sep 18, 2025
518c320
[inductor] libdevice.sqrt => tl.sqrt_rn (#163419)
jansel Sep 23, 2025
ed84e80
[inductor] Freeze layouts in FlexAttention (#163434)
jansel Sep 23, 2025
9c4d9f9
[inductor] Support out_dtype arg to matmul (#163393)
jansel Sep 23, 2025
6ef7487
[dynamo] Fix TorchFunctionMode handling with get_rng_state (#163412)
jansel Sep 23, 2025
49e7b2f
[inductor] Fix error from custom CUDA allocators (#163422)
jansel Sep 23, 2025
720a7b2
[export] Remove .contiguous() when saving weights to raw bytes (#163587)
yiming0416 Sep 23, 2025
0f67407
Large tests failing on bfloat16 (#163537)
drisspg Sep 22, 2025
b3cf5c7
Skip on sm100 later since Tests are non determinisitic (#163552)
drisspg Sep 22, 2025
5f0c7cb
Add B200 smoke test (#159494)
drisspg Sep 22, 2025
ebddbe7
[ROCm][CI] skip test_sparse_triangular_solve (#163651)
jeffdaily Sep 23, 2025
6e5dddb
Use accelerator API in common_dtensor (#163498)
dilililiwhy Sep 23, 2025
221ac81
Revert "[precompile] Add option to disable guard check on aot-compile…
pytorchmergebot Sep 23, 2025
134dfbe
[DCP] DTensor slice dequantization with proper block alignment (#163532)
saumishr Sep 23, 2025
fde929c
[AOTI] Fix model_package_loader get_cpp_compile_command (#163561)
xuhancn Sep 23, 2025
2aadcea
[ROCm] Improve perf for elementwise broadcast with mixed dtype (#163562)
jerrymannil Sep 23, 2025
649ceda
[export] handling NamedTuple inputs (#162959)
Raman-RH Sep 23, 2025
ca35dc2
[EZ] Fix UP041 violations (#163648)
malfet Sep 23, 2025
0696a4b
[EZ] Perma-ignore UP038 (#163649)
malfet Sep 23, 2025
8e6b0c7
[Inductor] Remove `no_type_check` annotation on properties (#163570)
blaine-rister Sep 23, 2025
bcb893a
[ROCm] Build FBGEMM_GENAI for gfx942 only (#162648)
jithunnair-amd Sep 23, 2025
22c5e8c
Add num_store to inductor_meta and use it to scale persistent reducti…
PaulZhang12 Sep 22, 2025
2a9745d
[multi-kernel] shape-similarity kernel selection (#163090)
pianpwk Sep 23, 2025
fc84743
Implement CUDA stream protocol (#163614)
msaroufim Sep 23, 2025
e671dcc
Update tests to check for more robust pattern (#163107)
tugsbayasgalan Sep 23, 2025
5ca563e
symintify fill_diagonol_ (#163485)
bobrenjc93 Sep 23, 2025
b182365
[ez] use list initializer syntax in fill_diagonal_ (#163607)
bobrenjc93 Sep 23, 2025
8c8416b
Update pytorch.org links in docs/conf.py (#163682)
svekars Sep 23, 2025
29af258
Less aggressive persistent reduction when it could induce large maski…
eellison Sep 23, 2025
c3d9f08
[torchfuzz] introduce multi process fuzzer (#163560)
bobrenjc93 Sep 23, 2025
c63e417
use reduction hint for aggressive rblock (#163371)
eellison Sep 23, 2025
b879ef7
[ROCm][CI] skip TestCudaPrimaryCtx.test_set_device_0 (#163693)
jeffdaily Sep 23, 2025
2014908
[MPS] Compute `offset2bag/bag_size/max_indices` in `_embedding_bag` (…
kurtamohler Sep 19, 2025
6b5ad5f
[Kineto] Add list of string parsing for profiler (#163593)
muchulee8 Sep 23, 2025
f3f67ff
Fix warn message (#163578)
drisspg Sep 22, 2025
f9fa138
[BE] Delete all pre py-3.10 checks (#163653)
malfet Sep 23, 2025
ee75c3d
Support for amin, amax, and aminmax (#163669)
srsuryadev Sep 23, 2025
eb3fbf5
[inductor] in emulate_precision_casts, disable fma fusion in triton (…
v0i0 Sep 23, 2025
4535254
[3/N] Use std::filesystem in inductor (#163632)
cyyever Sep 24, 2025
dc93529
[Triton] [Inductor] Restrict subprocess autotuning to just Triton (#1…
njriasan Sep 24, 2025
1e754d5
docs and optional kwargs for full graph capture (#163550)
avikchaudhuri Sep 24, 2025
be6c127
[AOTI] Pass comments from metadata to the autotune block (#163600)
desertfire Sep 23, 2025
e2ce79e
[Flex] Fix silent correctness w/ backpropping grads (#163677)
drisspg Sep 23, 2025
c261c71
Simplify _compute_local_shape_and_global_offset and make it SPMD. (#1…
ezyang Sep 19, 2025
ca512af
[inductor] Fix issue with scalar arg handling (#163481)
jansel Sep 23, 2025
6fa9727
[inductor] Fix bugs in emulate_precision_casts (#163520)
jansel Sep 23, 2025
d746b98
[inductor] Fix divmod error in decomp (#163482)
jansel Sep 23, 2025
42e9902
cd: Move arm64 to linux.arm64.r7g.12xlarge.memory (#163681)
seemethere Sep 23, 2025
6f1d962
[vllm hash update] update the pinned vllm hash (#163711)
pytorchupdatebot Sep 24, 2025
20eeb54
Add api info for torch._C._nn.pyi (#162936)
orangeH25 Sep 24, 2025
124dd36
[hop] support local_map + SAC (#163322)
xmfan Sep 23, 2025
0390798
[Triton] [Inductor] Enable Epilogue Subtiling in the blackwell ws tem…
njriasan Sep 24, 2025
a8e9ed2
[inductor] turn on loaf (for oss) by default (#162030)
shunting314 Sep 22, 2025
f68de58
[Inductor-FX] Support symbol and dynamic scalar graph inputs and outp…
blaine-rister Sep 24, 2025
2c5a3d7
Delete functorch C extension entirely. (#163340)
ezyang Sep 24, 2025
dad54ca
Add mistral/gpt-oss to benchmarks (#163565)
angelayi Sep 24, 2025
11a231e
[c10d] P2P tensors must be dense (#163719)
kwen2501 Sep 24, 2025
bf0747c
[Code Clean] Remove deadcodes about Python3.9 [1/N] (#163626)
fffrog Sep 24, 2025
0bca779
[Code Clean] Remove deadcodes about Python3.9 [2/N] (#163627)
fffrog Sep 24, 2025
33aabdd
[Code Clean] Remove deadcodes about Python3.9 [3/N] (#163629)
fffrog Sep 24, 2025
ec0cd81
[Code Clean] Remove deadcodes about Python3.9 [4/N] (#163643)
fffrog Sep 24, 2025
6f34cc0
[Code Clean] Remove deadcodes about Python3.9 [5/N] (#163644)
fffrog Sep 24, 2025
a635505
[Code Clean] Remove deadcodes about Python3.9 [6/N] (#163645)
fffrog Sep 24, 2025
2390d34
[Code Clean] Remove deadcodes about Python3.9 [7/N] (#163646)
fffrog Sep 24, 2025
3e1b1a3
Revert "[inductor] Fix issue with scalar arg handling" (#163737)
jansel Sep 24, 2025
207f104
[Triton] [Inductor] Set default configs for Blackwell Matmul Template…
njriasan Sep 24, 2025
b66aa1a
[ARM] Add test_memory_profiler to aarch64 tests (#145260)
robert-hardwick Sep 24, 2025
141fc72
[CD] CUDA 13.0 fix preload logic to include nvidia/cu13/lib/ (#163661)
atalman Sep 24, 2025
3b73841
update test_quantization tests to run weekly (#163077)
liangel-02 Sep 24, 2025
9d0d98a
Use cuda nvrtc so file based on cuda version used by torch (#163642)
atalman Sep 24, 2025
5d0f639
Make `Tensor.__dlpack__(stream=None)` capture-safe during CUDA Graph …
eee4017 Sep 24, 2025
4c2c401
Record redistribute_local_tensor in DebugMode (#163704)
SherlockNoMad Sep 24, 2025
9341ede
Revert to old behaviour of not padding strides if shape or stride is …
nandesuka Sep 24, 2025
768361e
Add less warps config to inner reductions (#162447)
PaulZhang12 Sep 24, 2025
c414f75
[WOQ][Inductor] Enable CUDA coverage for _weight_int8pack_mm (#163461)
bbeckca Sep 24, 2025
0456b23
[AOTI] Add verbose error information for extract file (#163718)
xuhancn Sep 24, 2025
71eec6a
[dist] handle discontiguous allgather/reducescatter inputs (#163712)
ngimel Sep 24, 2025
0dce2af
[ROCm][CI] adjust tf32 tolerance for test_compile_kernel_advanced (#1…
jeffdaily Sep 24, 2025
90a2825
Add `inference_mode` hint message to use `eval` with inference. (#163…
zeshengzong Sep 24, 2025
1495b35
Remove Python 3.9 for Triton builds (#163778)
atalman Sep 24, 2025
b40191b
Merge remote-tracking branch 'upstream/main' into rocm7.1_internal_te…
github-actions[bot] Sep 24, 2025
f3e8213
Fix merge conflicts
pragupta Sep 24, 2025
0ad8381
Address review comments wrt triton_heuristics and install_rocm
pragupta Sep 30, 2025
63fcd9b
update related_commits
pragupta Sep 30, 2025
77f4534
Fix more conflicts with triton_heuristics.py
pragupta Sep 30, 2025
4 changes: 0 additions & 4 deletions .ci/docker/ci_commit_pins/triton.txt
@@ -1,5 +1 @@
<<<<<<< HEAD
6193b30becb1ac7be704cf87b8cb9bf13e7f9689
=======
bbb06c0334a6772b92d24bde54956e675c8c6604
>>>>>>> upstream/main
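Resolution note: the four deleted lines are the <<<<<<< and >>>>>>> markers, the ======= separator, and the HEAD pin, so the upstream Triton pin (the hash that commit d008670 moved the 3.5 pin to) survives as the file's only line. Assuming that reading, the resolved file is simply:

    bbb06c0334a6772b92d24bde54956e675c8c6604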
7 changes: 2 additions & 5 deletions .ci/docker/common/install_rocm.sh
@@ -114,12 +114,9 @@ EOF
rm -rf HIP clr
fi

<<<<<<< HEAD
# temporary hipblasLT dependency install
apt install libmsgpackc2
Review thread:

Collaborator: @pragupta This change was supposed to be temporary as per f1ad49a (cc @pruthvistony).
Can we please ascertain whether this is really needed for the ROCm 7.1 mainline?
cc @jeffdaily to comment on whether this is needed for the ROCm 7.0 CI upstream enablement.

Collaborator: The ROCm 7 CI upgrade doesn't have this line. What was this fixing?

Collaborator (author): Removed.

=======
pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION"
>>>>>>> upstream/main

# Cleanup
apt-get autoclean && apt-get clean
@@ -131,8 +128,8 @@ install_centos() {
yum update -y
yum install -y kmod
yum install -y wget
if [[ $OS_VERSION == 9 ]]; then

if [[ $OS_VERSION == 9 ]]; then
dnf install -y openblas-serial
dnf install -y dkms kernel-headers kernel-devel
else
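Resolution note: per the review thread above, the temporary libmsgpackc2 install from HEAD was removed ("Removed.") and the upstream composable_kernel install was kept, which is what the 2-addition/5-deletion count implies. A minimal sketch of the resolved region, assuming the upstream side won:

    # upstream side kept: install composable_kernel from the pinned fork
    pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION"

    # Cleanup
    apt-get autoclean && apt-get clean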
11 changes: 0 additions & 11 deletions .ci/docker/requirements-ci.txt
@@ -112,13 +112,8 @@ ninja==1.11.1.3
#Pinned versions: 1.11.1.3
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py

<<<<<<< HEAD
numba==0.60.0 ; python_version == "3.9"
numba==0.61.2 ; python_version > "3.9"
=======
numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
>>>>>>> upstream/main
#Description: Just-In-Time Compiler for Numerical Functions
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
#test that import: test_numba_integration.py
@@ -137,14 +132,8 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
#test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py,
#test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py,
#test_binary_ufuncs.py
<<<<<<< HEAD
numpy==2.0.2 ; python_version == "3.9"
numpy==2.1.2 ; python_version > "3.9"
=======
numpy==1.22.4; python_version == "3.10"
numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
numpy==2.1.2; python_version >= "3.13"
>>>>>>> upstream/main

pandas==2.2.3

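Resolution note: both hunks here are pure deletions of the HEAD side plus markers, so the upstream pins survive. Under that reading, the resolved requirements-ci.txt keeps:

    numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
    numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
    numpy==1.22.4; python_version == "3.10"
    numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
    numpy==2.1.2; python_version >= "3.13"

This matches the "[BE] Update Python min version to 3.10 (#162310)" work in this merge: HEAD's python_version == "3.9" pins are gone.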
3 changes: 0 additions & 3 deletions CMakeLists.txt
@@ -896,8 +896,6 @@ cmake_dependent_option(
"USE_CUDA OR USE_ROCM"
OFF)

<<<<<<< HEAD
=======
IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF")
set(USE_FBGEMM_GENAI off)
@@ -909,7 +907,6 @@ if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPI
set(USE_FBGEMM_GENAI ON)
endif()

>>>>>>> upstream/main
# CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem
# Eff Attention won't
cmake_dependent_option(
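Resolution note: the three deletions are exactly the conflict markers (the HEAD side of this conflict was empty), leaving the upstream FBGEMM GenAI gating in place. Sketched, with the closing endif() assumed from the collapsed context:

    IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
      message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF")
      set(USE_FBGEMM_GENAI off)
    endif()

This lines up with commit bcb893a, "[ROCm] Build FBGEMM_GENAI for gfx942 only (#162648)".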
4 changes: 0 additions & 4 deletions aten/src/ATen/native/Normalization.cpp
@@ -671,13 +671,9 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, int64_t> _batch_norm_impl_index(
std::cout << "PYTORCH_MIOPEN_EXTRA_LOGGING: ********************* _batch_norm_impl_index (calling miopen_batch_norm)" << std::endl;
return std::tuple_cat(
at::miopen_batch_norm(
<<<<<<< HEAD
input.contiguous(input.suggest_memory_format()), weight.contiguous(), bias.contiguous(),
=======
input.contiguous(input.suggest_memory_format()),
weight.contiguous(),
bias.contiguous(),
>>>>>>> upstream/main
running_mean.defined() ? running_mean.contiguous() : running_mean,
running_var.defined() ? running_var.contiguous() : running_var,
training, momentum, eps),
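Resolution note: the two sides of this conflict are semantically identical; only the formatting of the miopen_batch_norm call differs, and the upstream multi-line form is kept:

    // upstream formatting kept; behavior unchanged
    at::miopen_batch_norm(
        input.contiguous(input.suggest_memory_format()),
        weight.contiguous(),
        bias.contiguous(),
        ...)  // remaining arguments as in the surrounding context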
21 changes: 0 additions & 21 deletions aten/src/ATen/native/miopen/BatchNorm_miopen.cpp
@@ -103,11 +103,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm(
mode = miopenBNSpatial;
}

<<<<<<< HEAD
auto output_t = at::empty(input->sizes(), input->options(), input->suggest_memory_format());
=======
auto output_t = at::empty_like(input_t, input_t.options(), input_t.suggest_memory_format());
>>>>>>> upstream/main
TensorArg output{ output_t, "output", 0 };

auto handle = getMiopenHandle();
@@ -180,18 +176,10 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(

auto grad_output_contig =
grad_output_t.contiguous(input_t.suggest_memory_format());
<<<<<<< HEAD
TensorArg input{ input_t, "input", 1 },
grad_output{ grad_output_contig, "grad_output", 2 },
weight{ weight_t, "weight", 3 },
save_mean{ save_mean_t, "save_mean", 4 },
save_var{ save_var_t, "save_var", 5 };
=======
TensorArg input{input_t, "input", 1},
grad_output{grad_output_contig, "grad_output", 2},
weight{weight_t, "weight", 3}, save_mean{save_mean_t, "save_mean", 4},
save_var{save_var_t, "save_var", 5};
>>>>>>> upstream/main
CheckedFrom c = "miopen_batch_norm_backward";

checkAllDefined(c, {input, grad_output, weight, save_mean, save_var});
@@ -203,13 +191,9 @@ }
}
checkAllSameType(c, {input, grad_output});
checkAllSameType(c, {weight, save_mean, save_var});
<<<<<<< HEAD
checkAllContiguous(c, {save_mean, save_var});
=======
// TODO: is weight required to be contiguous?
checkAllContiguous(c, {save_mean, save_var});
// TODO: TensorArg check should start handle memory format
>>>>>>> upstream/main
TORCH_CHECK(input->is_contiguous(input->suggest_memory_format()));
TORCH_CHECK(grad_output->is_contiguous(input->suggest_memory_format()));
checkDimRange(c, input, 2, 6 /* exclusive */);
@@ -226,12 +210,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
mode = miopenBNSpatial;
}

<<<<<<< HEAD
auto grad_input_t = at::empty(
input->sizes(), input->options(), input->suggest_memory_format());
=======
auto grad_input_t = at::empty(input->sizes(), input->options(), input->suggest_memory_format());
>>>>>>> upstream/main
auto grad_weight_t = at::empty(weight->sizes(), weight->options());
auto grad_bias_t = at::empty(weight->sizes(), weight->options());

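Resolution note: all four conflicts in this file resolve to the upstream side. The one substantive change is the forward output allocation, where upstream's empty_like replaces HEAD's empty:

    // upstream side kept: allocate like input_t, preserving its memory format
    auto output_t = at::empty_like(input_t, input_t.options(), input_t.suggest_memory_format());

The other three hunks (TensorArg layout, the contiguity TODO comments, and the grad_input_t allocation) differ only in formatting and comments.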
11 changes: 0 additions & 11 deletions requirements-build.txt
@@ -1,6 +1,5 @@
# Build System requirements
setuptools>=70.1.0,<80.0 # setuptools develop deprecated on 80.0
<<<<<<< HEAD
cmake>=3.31.4
ninja==1.11.1.3
numpy==2.0.2 ; python_version == "3.9"
@@ -10,14 +9,4 @@ pyyaml==6.0.2
requests==2.32.4
six==1.17.0 # dependency chain: NNPACK -> PeachPy -> six
typing-extensions==4.14.1
=======
cmake>=3.27
ninja
numpy
packaging
pyyaml
requests
six # dependency chain: NNPACK -> PeachPy -> six
typing-extensions>=4.10.0
pip # not technically needed, but this makes setup.py invocation work
>>>>>>> upstream/main
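Resolution note: this conflict resolves the other way from the CI files above, keeping HEAD's fully pinned build requirements and deleting upstream's looser list (cmake>=3.27, unpinned ninja/numpy/etc., plus the extra pip entry). Under that reading, the resolved file is:

    # Build System requirements
    setuptools>=70.1.0,<80.0 # setuptools develop deprecated on 80.0
    cmake>=3.31.4
    ninja==1.11.1.3
    numpy==2.0.2 ; python_version == "3.9"
    # (collapsed lines elided)
    pyyaml==6.0.2
    requests==2.32.4
    six==1.17.0 # dependency chain: NNPACK -> PeachPy -> six
    typing-extensions==4.14.1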
13 changes: 0 additions & 13 deletions test/nn/test_convolution.py
@@ -50,12 +50,6 @@
parametrize as parametrize_test,
run_tests,
set_default_dtype,
<<<<<<< HEAD
skipIfRocm,
skipIfNotMiopenSuggestNHWC,
skipIfRocmVersionLessThan,
=======
>>>>>>> upstream/main
subtest,
TEST_SCIPY,
TEST_WITH_ROCM,
@@ -4033,16 +4027,9 @@ def test_conv_double_backward_strided_with_3D_input_and_weight(self, device):

@skipCUDAIfRocm
@onlyCUDA
<<<<<<< HEAD
@largeTensorTest('40GB')
@largeTensorTest('24GB', 'cpu')
# Skipped for ROCm temp - https://ontrack-internal.amd.com/browse/SWDEV-383635
@skipIfRocm
=======
@largeTensorTest("40GB")
@largeTensorTest("24GB", "cpu")
@tf32_on_and_off(0.005)
>>>>>>> upstream/main
def test_conv3d_64bit_indexing(self, device):
x = torch.rand(1, 32, 512, 512, 256)
m = torch.nn.Conv3d(32, 1, kernel_size=1, padding=0, stride=1, bias=False)
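Resolution note: the import hunk drops HEAD's ROCm skip helpers (skipIfRocm and friends), and the decorator hunk keeps upstream's stack for test_conv3d_64bit_indexing, replacing the blanket ROCm skip with a tf32 tolerance. A sketch of the resolved decorators, assuming the upstream side:

    @skipCUDAIfRocm
    @onlyCUDA
    @largeTensorTest("40GB")
    @largeTensorTest("24GB", "cpu")
    @tf32_on_and_off(0.005)
    def test_conv3d_64bit_indexing(self, device):
        x = torch.rand(1, 32, 512, 512, 256)
        ...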
4 changes: 0 additions & 4 deletions test/test_binary_ufuncs.py
@@ -1481,11 +1481,7 @@ def to_np(value):
elif torch.can_cast(torch.result_type(base, exponent), base.dtype):
actual2 = actual.pow_(exponent)
self.assertEqual(actual, expected.to(actual))
<<<<<<< HEAD
self.assertEqual(actual2, expected.to(actual))
=======
self.assertEqual(actual2, expected.to(actual2))
>>>>>>> upstream/main
else:
self.assertRaisesRegex(
RuntimeError,
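Resolution note: upstream's assertion is kept. Because actual2 is the return value of the in-place actual.pow_(exponent), the two spellings refer to the same tensor here; upstream's expected.to(actual2) simply compares each result against a reference cast to that result's own dtype, which stays correct even if the two names ever stop aliasing. A minimal sketch of the aliasing (hypothetical values):

    import torch
    actual = torch.tensor([2.0])
    actual2 = actual.pow_(3)   # in-place op returns self
    assert actual2 is actual   # so .to(actual) and .to(actual2) agree here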
19 changes: 0 additions & 19 deletions test/test_nn.py
@@ -5199,24 +5199,6 @@ def test_batchnorm_nhwc_cuda(self):
name_fn=lambda f, b, m, t: f"{f}_vs_{b}{'_mixed' if m else ''}_{dtype_name(t)}"
)
def test_batchnorm(self, dims, mode, memory_format, ref_backend, mixed, dtype):
<<<<<<< HEAD
if self._testMethodName == "test_batchnorm_3D_train_NCHW_vs_native_mixed_float16":
self.skipTest("3D float16 NCHW train failed on CUDA and ROCm due to Native batchnorm accuracy issue SWDEV-541024")
if torch.version.hip:
if self._testMethodName in ("test_batchnorm_2D_train_NHWC_vs_NCHW_mixed_bfloat16",
"test_batchnorm_2D_train_NCHW_vs_cpu_mixed_bfloat16",
"test_batchnorm_3D_train_NHWC_vs_NCHW_mixed_bfloat16",
"test_batchnorm_3D_train_NCHW_vs_cpu_mixed_bfloat16"
) and _get_torch_rocm_version() < (6, 4):
# NCHW bfloat16 path uses native kernels for rocm<=6.3
# train failed on rocm<=6.3 due to native tolerance issue SWDEV-507600
self.skipTest("bfloat16 NHWC train failed on ROCm <= 6.3")

if self._testMethodName in ("test_batchnorm_2D_train_NCHW_vs_native_mixed_bfloat16",
"test_batchnorm_3D_train_NCHW_vs_native_mixed_bfloat16"
) and _get_torch_rocm_version() >= (6, 4):
self.skipTest("bfloat16 NCHW train failed due to native tolerance issue SWDEV-507600")
=======
if torch.version.cuda:
if self._testMethodName in ("test_batchnorm_2D_train_NCHW_vs_cpu_mixed_bfloat16",
"test_batchnorm_3D_train_NCHW_vs_cpu_mixed_bfloat16",
@@ -5244,7 +5226,6 @@ def test_batchnorm(self, dims, mode, memory_format, ref_backend, mixed, dtype):

if self._testMethodName == "test_batchnorm_3D_train_NCHW_vs_native_mixed_float16":
self.skipTest("3D float16 NCHW train failed on ROCm")
>>>>>>> upstream/main

if dims == 3 and memory_format in ("NHWC", "NCHW"):
memory_format = memory_format + "3D"
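Resolution note: HEAD's ROCm-version-gated skips (the SWDEV-541024 and SWDEV-507600 tickets) are dropped in favor of upstream's torch.version.cuda-gated skip list, with the float16 case surviving in upstream's form:

    # upstream side kept (visible fragment)
    if self._testMethodName == "test_batchnorm_3D_train_NCHW_vs_native_mixed_float16":
        self.skipTest("3D float16 NCHW train failed on ROCm")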
14 changes: 1 addition & 13 deletions torch/_inductor/runtime/triton_heuristics.py
@@ -2924,7 +2924,7 @@ def _persistent_reduction_configs(
for xblock in (1, 8, 32, 128)
if xblock == 1 or (xblock <= xnumel and (max_autotune_enabled or rnumel * xblock <= 4096))
]

if "y" not in size_hints:
configs = [
triton_config_reduction(
@@ -2958,17 +2958,6 @@ def _persistent_reduction_configs(
# defer to more autotuning, initially
if "y" in size_hints:
pass
<<<<<<< HEAD

if not max_autotune_enabled: # Don't filter if tuning enabled
Review thread:

Collaborator: @jataylo to double-check this conflict resolution in case not already consulted.

Collaborator (author, @pragupta, Sep 30, 2025): Spoke to @naromero77amd; he mentioned that these changes went into rocm7.1_internal_testing, but the upstream PR is still open, so we want to keep the rocm7.1_internal_testing changes in place. He pointed me to his upstream PR here: pytorch#163908.
Tried to keep the local changes, but some of them were not trivial, since Nick's upstream PR is based on a newer upstream. @naromero77amd / @jataylo, can you please confirm that the latest commit I pushed corrects the merge of this file?

if reduction_hint == ReductionHint.INNER and rnumel >= 256:
configs = configs[:1]
elif reduction_hint == ReductionHint.OUTER:
configs = configs[-1:]

if reduction_hint == ReductionHint.OUTER_TINY:
tiny_configs = [
=======
# TODO(jansel): we should be able to improve these heuristics
elif reduction_hint == ReductionHint.INNER:
if rnumel > 1024:
@@ -2995,7 +2984,6 @@ def _persistent_reduction_configs(
configs = configs[-1:]
elif reduction_hint == ReductionHint.OUTER_TINY:
configs = [
>>>>>>> upstream/main
triton_config_reduction(
size_hints,
2 * (256 // rnumel) if rnumel <= 256 else 1,
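Resolution note: as rendered in this commit, the upstream heuristics win: HEAD's "don't filter when max-autotune is enabled" block is deleted and upstream's ReductionHint-based narrowing is kept. Per the review thread above, the intent was to preserve the rocm7.1_internal_testing behavior until pytorch#163908 lands, and the later commit "Fix more conflicts with triton_heuristics.py" reworks this merge. For reference, the HEAD-side filtering that was dropped here, sketched from the visible lines:

    # HEAD side (dropped in this commit): narrow the autotune candidates
    # only when max-autotune is off
    if not max_autotune_enabled:
        if reduction_hint == ReductionHint.INNER and rnumel >= 256:
            configs = configs[:1]    # keep only the first config
        elif reduction_hint == ReductionHint.OUTER:
            configs = configs[-1:]   # keep only the last config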
6 changes: 1 addition & 5 deletions torch/testing/_internal/common_utils.py
@@ -102,8 +102,8 @@
has_pytest = False


<<<<<<< HEAD
MI300_ARCH = ("gfx940", "gfx941", "gfx942")
MI200_ARCH = ("gfx90a")
NAVI_ARCH = ("gfx1030", "gfx1100", "gfx1101", "gfx1200", "gfx1201")
NAVI3_ARCH = ("gfx1100", "gfx1101")
NAVI4_ARCH = ("gfx1200", "gfx1201")
@@ -115,10 +115,6 @@ def is_arch(arch_list):
if gfx_arch in arch_list:
return True
return False
Review thread:

Collaborator: @pragupta We should track the upstreaming of this patch in one of our stories. cc @iupaikov-amd

=======
MI300_ARCH = ("gfx942",)
MI200_ARCH = ("gfx90a")
>>>>>>> upstream/main

def freeze_rng_state(*args, **kwargs):
return torch.testing._utils.freeze_rng_state(*args, **kwargs)
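Resolution note: HEAD wins here, keeping the wider MI300_ARCH tuple ("gfx940", "gfx941", "gfx942"), the NAVI tuples, and the is_arch helper, while upstream's narrower MI300_ARCH = ("gfx942",) is dropped; the review thread above flags this for upstreaming. One pre-existing wart survives on both sides: MI200_ARCH = ("gfx90a") is a plain string rather than a one-element tuple (missing trailing comma), so is_arch's `in` test falls back to substring matching for it, which happens to work for this single-arch case. A hypothetical usage sketch of the kept helper:

    # hypothetical call site: gate a tolerance bump on MI300-class hardware
    if torch.version.hip and is_arch(MI300_ARCH):
        rtol = 1e-3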