ROCm · mstankov-amd · Aug 15, 2025 · Aug 15, 2025 · Aug 18, 2025 · Aug 19, 2025
@@ -258,11 +258,6 @@ scipy==1.14.1 ; python_version > "3.9"
 #Pinned versions:
 #test that import:
 
-tb-nightly==2.13.0a20230426
-#Description: TensorBoard
-#Pinned versions:
-#test that import:
-
 # needed by torchgen utils
 typing-extensions>=4.10.0
 #Description: type hints for python

diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py
@@ -101,8 +101,12 @@ def build_triton(
 
         triton_repo = "https://github.com/openai/triton"
         if device == "rocm":
-            triton_pkg_name = "pytorch-triton-rocm"
             triton_repo = "https://github.com/ROCm/triton"
+            rocm_version = get_rocm_version()  # e.g., "7.0.1"
+            if tuple(map(int, rocm_version.split("."))) > (7, 0, 0):
+                triton_pkg_name = "triton"
+            else:
+                triton_pkg_name = "pytorch-triton-rocm"
         elif device == "xpu":
             triton_pkg_name = "pytorch-triton-xpu"
             triton_repo = "https://github.com/intel/intel-xpu-backend-for-triton"

diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp
@@ -336,7 +336,7 @@ at::BlasBackend Context::blasPreferredBackend() {
       static const std::vector<std::string> archs = {
           "gfx90a", "gfx942",
 #if ROCM_VERSION >= 60400
-          "gfx1200", "gfx1201",
+          "gfx1150", "gfx1151", "gfx1200", "gfx1201",
 #endif
 #if ROCM_VERSION >= 60500
           "gfx950"
@@ -362,7 +362,7 @@ at::BlasBackend Context::blasPreferredBackend() {
       static const std::vector<std::string> archs = {
           "gfx90a", "gfx942",
 #if ROCM_VERSION >= 60300
-          "gfx1100", "gfx1101", "gfx1200", "gfx1201",
+          "gfx1100", "gfx1101", "gfx1150", "gfx1151", "gfx1200", "gfx1201",
 #endif
 #if ROCM_VERSION >= 60500
           "gfx950"

@@ -39,7 +39,7 @@ struct DescriptorDeleter {
 // function.
 template <typename T, miopenStatus_t (*ctor)(T**), miopenStatus_t (*dtor)(T*)>
 // NOLINTNEXTLINE(bugprone-exception-escape)
-class TORCH_CUDA_CPP_API Descriptor {
+class TORCH_HIP_CPP_API Descriptor {
  public:
   // Use desc() to access the underlying descriptor pointer in
   // a read-only fashion.  Most client code should use this.
@@ -65,7 +65,7 @@ class TORCH_CUDA_CPP_API Descriptor {
   std::unique_ptr<T, DescriptorDeleter<T, dtor>> desc_;
 };
 
-class TORCH_CUDA_CPP_API TensorDescriptor : public Descriptor<
+class TORCH_HIP_CPP_API TensorDescriptor : public Descriptor<
                                                miopenTensorDescriptor,
                                                &miopenCreateTensorDescriptor,
                                                &miopenDestroyTensorDescriptor> {
@@ -88,7 +88,7 @@ class TORCH_CUDA_CPP_API TensorDescriptor : public Descriptor<
 
 std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d);
 
-class TORCH_CUDA_CPP_API FilterDescriptor : public Descriptor<
+class TORCH_HIP_CPP_API FilterDescriptor : public Descriptor<
                                                miopenTensorDescriptor,
                                                &miopenCreateTensorDescriptor,
                                                &miopenDestroyTensorDescriptor> {
@@ -105,7 +105,7 @@ class TORCH_CUDA_CPP_API FilterDescriptor : public Descriptor<
   }
 };
 
-struct TORCH_CUDA_CPP_API ConvolutionDescriptor
+struct TORCH_HIP_CPP_API ConvolutionDescriptor
     : public Descriptor<
           miopenConvolutionDescriptor,
           &miopenCreateConvolutionDescriptor,
@@ -121,7 +121,7 @@ struct TORCH_CUDA_CPP_API ConvolutionDescriptor
 };
 
 // NOLINTNEXTLINE(bugprone-exception-escape)
-struct TORCH_CUDA_CPP_API DropoutDescriptor
+struct TORCH_HIP_CPP_API DropoutDescriptor
     : public Descriptor<
           miopenDropoutDescriptor,
           &miopenCreateDropoutDescriptor,
@@ -137,7 +137,7 @@ struct TORCH_CUDA_CPP_API DropoutDescriptor
     }
 };
 
-struct TORCH_CUDA_CPP_API RNNDescriptor
+struct TORCH_HIP_CPP_API RNNDescriptor
   : public Descriptor<miopenRNNDescriptor,
                       &miopenCreateRNNDescriptor,
                       &miopenDestroyRNNDescriptor>

@@ -5,5 +5,5 @@
 
 namespace at::native {
 
-TORCH_CUDA_CPP_API miopenHandle_t getMiopenHandle();
+TORCH_HIP_CPP_API miopenHandle_t getMiopenHandle();
 } // namespace at::native
@@ -6,7 +6,7 @@
 
 namespace at::native {
 
-TORCH_CUDA_CPP_API miopenDataType_t getMiopenDataType(const at::Tensor& tensor);
+TORCH_HIP_CPP_API miopenDataType_t getMiopenDataType(const at::Tensor& tensor);
 
 int64_t miopen_version();
 

diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp
@@ -270,7 +270,7 @@ static bool isSupportedHipLtROCmArch(int index) {
     static const std::vector<std::string> archs = {
         "gfx90a", "gfx942",
 #if ROCM_VERSION >= 60300
-        "gfx1100", "gfx1101", "gfx1200", "gfx1201",
+        "gfx1100", "gfx1101", "gfx1150", "gfx1151", "gfx1200", "gfx1201",
 #endif
 #if ROCM_VERSION >= 60500
         "gfx950"

diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh
@@ -797,14 +797,23 @@ struct ReduceOp {
     if (should_store) {
       index_t offset = config.staging_memory_offset(blockIdx.y);
       reduce_buffer[offset] = value;
+#ifdef USE_ROCM
+      __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent"); // make sure writes are globally visible
+#endif
     }
 
+#ifndef USE_ROCM
     __threadfence(); // make sure writes are globally visible
+#endif
     __syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done
     bool is_last_block_done = mark_block_finished();
 
     if (is_last_block_done) {
+#ifdef USE_ROCM
+      __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent"); // complete the acquire pattern after release
+#else
       __threadfence(); // complete the acquire pattern after atomic
+#endif
       for (auto &v : value) {
         v = ident;
       }
@@ -822,6 +831,23 @@ struct ReduceOp {
       } else {
         index_t input_offset = threadIdx.y;
         index_t step = blockDim.y;
+#ifdef USE_ROCM // Prefetch loads to better hide their latency
+        #define PRFCH 4
+        for (; input_offset < config.ctas_per_output; input_offset += step*PRFCH) {
+         arg_vec_t next[PRFCH];
+         #pragma unroll
+         for (int u = 0; (u < PRFCH) && (input_offset + u*step < config.ctas_per_output); u++) {
+          index_t idx = config.staging_memory_offset(input_offset + u*step);
+          next[u] = reduce_buffer[idx];
+         }
+         for (int u = 0; (u < PRFCH) && (input_offset + u*step < config.ctas_per_output); u++) {
+          #pragma unroll
+          for (int i = 0; i < output_vec_size; i++) {
+            value[i] = ops.combine(value[i], next[u][i]);
+          }
+         }
+        }
+#else
         for (; input_offset < config.ctas_per_output; input_offset += step) {
           index_t idx = config.staging_memory_offset(input_offset);
           arg_vec_t next = reduce_buffer[idx];
@@ -830,6 +856,7 @@ struct ReduceOp {
             value[i] = ops.combine(value[i], next[i]);
           }
         }
+#endif
       }
       value = block_y_reduce<output_vec_size>(value, shared_memory);
       if (config.should_block_x_reduce()) {

diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake
@@ -93,19 +93,16 @@ if(HIP_FOUND)
   # hip (lower-case) package. Both are probed above and will be in
   # ROCM_INCLUDE_DIRS if available.
   find_file(ROCM_VERSION_HEADER_PATH
-    NAMES rocm-core/rocm_version.h
+    NAMES rocm-core/rocm_version.h hip/hip_version.h
     NO_DEFAULT_PATH
     PATHS ${ROCM_INCLUDE_DIRS}
   )
-  set(ROCM_LIB_NAME "ROCM")
-  if(NOT ROCM_VERSION_HEADER_PATH)
-    find_file(ROCM_VERSION_HEADER_PATH
-      NAMES hip/hip_version.h
-      NO_DEFAULT_PATH
-      PATHS ${ROCM_INCLUDE_DIRS}
-    )
+  if(ROCM_VERSION_HEADER_PATH MATCHES "rocm-core/rocm_version.h$")
+    set(ROCM_LIB_NAME "ROCM")
+  else()
     set(ROCM_LIB_NAME "HIP")
   endif()
+
   if(NOT ROCM_VERSION_HEADER_PATH)
     message(FATAL_ERROR "Could not find hip/hip_version.h or rocm-core/rocm_version.h in ${ROCM_INCLUDE_DIRS}")
   endif()

diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
@@ -639,6 +639,14 @@ def _helper_test_extra_cuda_context_by_memory(self):
         """
         device = torch.device(f"cuda:{self.rank:d}")
         x = torch.empty((1,), device=device)
+
+        # We need this barrier to ensure that all nodes have completed init_process_group
+        # If rank=0 gets a mem snapshot before other nodes have finished init_process_group,
+        # then we artificially see a bump in memory usage. As per the following comment,
+        # we are going to be moving away from this function:
+        # https://github.com/pytorch/pytorch/pull/154174#discussion_r2105065931
+        c10d.barrier()
+
         # Rank 0 takes a snapshot before collective -- this snapshot should have
         # included rank 0's own context.
         if self.rank == 0:

diff --git a/test/inductor/test_flex_decoding.py b/test/inductor/test_flex_decoding.py
@@ -22,7 +22,10 @@
 )
 from torch.testing import FileCheck
 from torch.testing._internal import common_utils
-from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_BF16
+from torch.testing._internal.common_cuda import (
+    PLATFORM_SUPPORTS_BF16,
+    PLATFORM_SUPPORTS_FLASH_ATTENTION,
+)
 from torch.testing._internal.common_device_type import (
     flex_attention_supported_platform as supported_platform,
     instantiate_device_type_tests,
@@ -1582,6 +1585,7 @@ def mask_mod(b, h, q, kv):
         self.assertEqual(out[:, :, M:, :].sum(), 0)
 
     @supported_platform
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA")
     def test_windowed_no_mask_vs_sdpa(self, device):
         score_mod = _generate_windowed(1000)
         attention = functools.partial(flex_attention, score_mod=score_mod)

diff --git a/test/inductor/test_max_autotune.py b/test/inductor/test_max_autotune.py
@@ -37,11 +37,15 @@
 )
 from torch._inductor.template_heuristics import CUDAConfigHeuristic, GemmConfig
 from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FP8
+from torch.testing._internal.common_device_type import largeTensorTest
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
     IS_WINDOWS,
     parametrize,
     TEST_WITH_ROCM,
+    MI300_ARCH,
+    runOnRocmArch,
+    skipIfXpu,
 )
 from torch.testing._internal.logging_utils import multiple_logs_to_string
 from torch.utils._triton import has_triton_tma_device
@@ -54,7 +58,6 @@
 from torch._inductor.virtualized import V
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.testing import FileCheck
-from torch.testing._internal.common_utils import MI300_ARCH, runOnRocmArch, skipIfXpu
 from torch.testing._internal.inductor_utils import (
     get_func_call,
     get_kernel_launch,
@@ -804,6 +807,8 @@ def test_conv_backend(self):
 
         self.assertIn("NoValidChoicesError", str(context.exception))
 
+    # Some ROCm GPUs don't have enough VRAM to run all autotune configurations and padding benchmarks
+    @largeTensorTest("30 GB", device=GPU_TYPE)
     def test_non_contiguous_input_mm(self):
         """
         Make sure the triton template can work with non-contiguous inputs without crash.
@@ -856,6 +861,8 @@ def f(x, y):
     # TODO: fix accuracy failure of the triton template on XPU.
     # and enable this test case.
     @skipIfXpu
+    # Some ROCm GPUs don't have enough VRAM to run all autotune configurations and padding benchmarks
+    @largeTensorTest("30 GB", device=GPU_TYPE)
     def test_non_contiguous_input_mm_plus_mm(self):
         x1 = rand_strided((50257, 32768), (1, 50304), device=GPU_TYPE)
         y1 = rand_strided((32768, 768), (768, 1), device=GPU_TYPE)

diff --git a/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h b/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h
@@ -260,7 +260,7 @@ typedef __half half;
 )";
 #endif
 
-#if defined(USE_ROCM)
+#if defined(USE_ROCM) && ROCM_VERSION < 70000
 constexpr auto bfloat16_support_literal =
     R"(
 #ifndef __align__
@@ -317,6 +317,75 @@ __device__ __nv_bfloat16 __float2bfloat16(const float a) {
   return val;
 }
 
+__device__ float __bfloat162float(const __nv_bfloat16 a) {
+  union
+  {
+      uint32_t int32;
+      float    fp32;
+  } u = {uint32_t(a.__x) << 16};
+  return u.fp32;
+}
+#endif /* defined(__cplusplus) */
+)";
+#elif defined(USE_ROCM) && ROCM_VERSION >= 70000
+constexpr auto bfloat16_support_literal =
+    R"(
+#ifndef __align__
+#define __align__(x) __attribute__((aligned(x)))
+#endif
+
+typedef unsigned int uint32_t;
+
+typedef struct __align__(2) {
+  unsigned short x;
+}
+__nv_bfloat16_raw;
+
+#if defined(__cplusplus)
+struct __align__(2) __nv_bfloat16 {
+  __host__ __device__ __nv_bfloat16() {}
+
+  __host__ __device__ __nv_bfloat16& operator=(const __nv_bfloat16_raw& hr) {
+    __x = hr.x;
+    return *this;
+  }
+
+  unsigned short __x;
+};
+
+__device__ unsigned short __internal_float2bfloat16(
+    const float f,
+    unsigned int& sign,
+    unsigned int& remainder) {
+  unsigned int x;
+
+  x = __float_as_uint(f);
+
+  if ((x & 0x7fffffffU) > 0x7f800000U) {
+    sign = 0U;
+    remainder = 0U;
+    return static_cast<unsigned short>(0x7fffU);
+  }
+  sign = x >> 31;
+  remainder = x << 16;
+  return static_cast<unsigned short>(x >> 16);
+}
+
+/* Definitions of intrinsics */
+__device__ __nv_bfloat16 __float2bfloat16(const float a) {
+  __nv_bfloat16 val;
+  __nv_bfloat16_raw r;
+  unsigned int sign;
+  unsigned int remainder;
+  r.x = __internal_float2bfloat16(a, sign, remainder);
+  if ((remainder > 0x80000000U) ||
+      ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
+    r.x++;
+  }
+  val = r;
+  return val;
+}
+
 __device__ float __bfloat162float(const __nv_bfloat16 a) {
   union
   {