ROCm · jithunnair-amd · Aug 25, 2025 · Aug 14, 2025
diff --git a/aten/src/ATen/native/cuda/SortStable.cu b/aten/src/ATen/native/cuda/SortStable.cu
@@ -226,8 +226,9 @@ void launch_stable_sort_kernel(
     return;
   }
 
-  int64_t numel_or_intmax =
-      std::min(numel, static_cast<int64_t>(std::numeric_limits<int>::max()));
+  const int64_t intmax = static_cast<int64_t>(std::numeric_limits<int>::max());
+  // On ROCm, std::min -> ::min did not work as expected when input values are >= 2147483648
+  int64_t numel_or_intmax = numel < intmax ? numel : intmax;
   int64_t nsort = self.size(dim);
   int64_t nbatch = (numel_or_intmax / nsort) * nsort;
   TORCH_CHECK(nbatch > 0, "Cannot sort dimension of length ", nsort);
@@ -239,7 +240,8 @@ void launch_stable_sort_kernel(
         scalar_t* values_ptr = values.mutable_data_ptr<scalar_t>();
         int64_t remaining = numel;
         while (remaining > 0) {
-          int64_t n = std::min(remaining, nbatch);
+          // On ROCm, std::min -> ::min did not work as expected when input values are >= 2147483648
+          int64_t n = remaining < nbatch ? remaining : nbatch;
           int64_t nsegments = n / nsort;
 
           if (nsegments == 1 ||

diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py
@@ -8,6 +8,7 @@
 import random
 import sys
 import unittest
+from packaging.version import Version
 from typing import NamedTuple, List
 
 import torch
@@ -65,7 +66,7 @@ class PointwisePostOp(NamedTuple):
 def avoid_vpmaddubsw_overflow_linear(
     batch_size, input_channels, output_channels, X, X_min, X_max, W, W_min, W_max
 ):
-    if sys.version_info >= (3, 13):
+    if Version(np.__version__) >= Version("2.1"):
         raise unittest.SkipTest("numpy 2.1 overflow error")
     for i, j in np.ndindex((batch_size, output_channels)):
         for k in range(0, input_channels // 2 * 2, 2):

diff --git a/test/test_matmul_cuda.py b/test/test_matmul_cuda.py
@@ -361,10 +361,9 @@ def test_float8_basics(self, device) -> None:
 
         self._test_tautological_mm(device, size=64, out_dtype=torch.float16)
         self._test_tautological_mm(device, size=96, out_dtype=torch.float32)
-        # hipblaslt does not yet support bfloat16 output
-        if torch.version.hip is None:
-            self._test_tautological_mm(device, size=80, out_dtype=torch.bfloat16)
-        with self.assertRaises(RuntimeError):
+        self._test_tautological_mm(device, size=80, out_dtype=torch.bfloat16)
+
+        with self.assertRaises(AssertionError if torch.version.hip or device == "cpu" else RuntimeError):
             self._test_tautological_mm(device, out_dtype=e5m2_type)
 
     @unittest.skipIf(not PLATFORM_SUPPORTS_FP8, f8_msg)

diff --git a/test/test_scatter_gather_ops.py b/test/test_scatter_gather_ops.py
@@ -158,7 +158,7 @@ def _test_scatter_base(self, fn, *, device, dtype, is_scalar, reduction,
             # When we are running opportunistic_fastatomics, we will expect some floating point rounding
             # errors as the order of operation is not guaranteed.
             if TEST_WITH_ROCM \
-                    and 'gfx94' in torch.cuda.get_device_properties(0).gcnArchName \
+                    and torch.cuda.get_device_properties(0).gcnArchName[0:5] in ('gfx94', 'gfx95')\
                     and not torch.are_deterministic_algorithms_enabled():
                 self.assertEqual(actual, expected, atol=1e-9, rtol=1e-6)
             else:

diff --git a/test/test_sort_and_select.py b/test/test_sort_and_select.py
@@ -209,21 +209,21 @@ def test_stable_sort(self, device, dtype):
             )
 
     @onlyCUDA
-    @dtypes(torch.uint8)
+    @dtypes(torch.float16)
     @largeTensorTest("200GB")  # Unfortunately 80GB A100 is not large enough
     def test_sort_large(self, device, dtype):
         t0 = torch.randperm(8192, device=device).to(dtype)
         t = t0.view(1, 8192).expand(2**18 + 1, -1).contiguous()
         v, i = t.sort()
         del t
-        iv, im = i.var_mean(dim=0)
+        iv, im = torch.var_mean(i.to(dtype), dim=0)
         del i
-        vv, vm = v.var_mean(dim=0)
+        vv, vm = torch.var_mean(v.to(dtype), dim=0)
         del v
         self.assertEqual(vv, torch.zeros_like(vv))
         self.assertEqual(iv, torch.zeros_like(iv))
-        self.assertEqual(vm, torch.arange(255, dtype=dtype, device=device))
-        self.assertEqual(im, t0.sort().indices)
+        self.assertEqual(vm, torch.arange(8192, dtype=dtype, device=device))
+        self.assertEqual(im, t0.sort().indices, exact_dtype=False)
 
     @dtypes(torch.float32)
     def test_sort_restride(self, device, dtype):

diff --git a/test/test_transformers.py b/test/test_transformers.py
@@ -3302,6 +3302,8 @@ def _get_mem_eff_drop_mask(batch_size, n_heads, q_len, kv_len, p, seed, offset,
                 fudge_factors['grad_query'] = 650.0
             if dtype == torch.float32:
                 fudge_factors['grad_key'] = 90.0
+                if "gfx95" in torch.cuda.get_device_properties(0).gcnArchName:
+                    fudge_factors['grad_value'] = 12.0
 
         check_out_and_grad(
             (out_ref, out_lp_ref, out),

diff --git a/torch/_tensor_str.py b/torch/_tensor_str.py
@@ -344,6 +344,7 @@ def _tensor_str(self, indent):
         torch.float8_e5m2fnuz,
         torch.float8_e4m3fn,
         torch.float8_e4m3fnuz,
+        torch.float8_e8m0fnu,
     ]:
         self = self.half()
 

diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py
@@ -1695,6 +1695,7 @@ def addmm_kernel_impl(*args, **kwargs):
     "is_bf16_supported",
     "is_current_stream_capturing",
     "is_initialized",
+    "is_tf32_supported",
     "jiterator",
     "list_gpu_processes",
     "make_graphed_callables",