From e299809adbee1eb97154f1bd942e7b383d6ac7e4 Mon Sep 17 00:00:00 2001
From: Dmitry Nikolaev
Date: Thu, 14 Aug 2025 22:09:28 +0000
Subject: [PATCH] [release/2.6] unittest fixes for MI350

---
 aten/src/ATen/native/cuda/SortStable.cu     |  8 +++++---
 test/quantization/core/test_quantized_op.py |  3 ++-
 test/test_matmul_cuda.py                    |  7 +++----
 test/test_scatter_gather_ops.py             |  2 +-
 test/test_sort_and_select.py                | 10 +++++-----
 test/test_transformers.py                   |  2 ++
 torch/_tensor_str.py                        |  1 +
 torch/cuda/__init__.py                      |  1 +
 8 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/aten/src/ATen/native/cuda/SortStable.cu b/aten/src/ATen/native/cuda/SortStable.cu
index 9e572a227fcd1..0b387febf59a9 100644
--- a/aten/src/ATen/native/cuda/SortStable.cu
+++ b/aten/src/ATen/native/cuda/SortStable.cu
@@ -226,8 +226,9 @@ void launch_stable_sort_kernel(
     return;
   }
 
-  int64_t numel_or_intmax =
-      std::min(numel, static_cast<int64_t>(std::numeric_limits<int>::max()));
+  const int64_t intmax = static_cast<int64_t>(std::numeric_limits<int>::max());
+  // On ROCm, std::min -> ::min did not work as expected when input values >= 2147483648
+  int64_t numel_or_intmax = numel < intmax ? numel : intmax;
   int64_t nsort = self.size(dim);
   int64_t nbatch = (numel_or_intmax / nsort) * nsort;
   TORCH_CHECK(nbatch > 0, "Cannot sort dimension of length ", nsort);
@@ -239,7 +240,8 @@ void launch_stable_sort_kernel(
         scalar_t* values_ptr = values.mutable_data_ptr<scalar_t>();
         int64_t remaining = numel;
         while (remaining > 0) {
-          int64_t n = std::min(remaining, nbatch);
+          // On ROCm, std::min -> ::min did not work as expected when input values >= 2147483648
+          int64_t n = remaining < nbatch ? remaining : nbatch;
           int64_t nsegments = n / nsort;
 
           if (nsegments == 1 ||
diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py
index dd1df1e0cd982..c7fa5adedbc96 100644
--- a/test/quantization/core/test_quantized_op.py
+++ b/test/quantization/core/test_quantized_op.py
@@ -8,6 +8,7 @@
 import random
 import sys
 import unittest
+from packaging.version import Version
 from typing import NamedTuple, List
 
 import torch
@@ -65,7 +66,7 @@ class PointwisePostOp(NamedTuple):
 def avoid_vpmaddubsw_overflow_linear(
     batch_size, input_channels, output_channels, X, X_min, X_max, W, W_min, W_max
 ):
-    if sys.version_info >= (3, 13):
+    if Version(np.__version__) >= Version("2.1"):
         raise unittest.SkipTest("numpy 2.1 overflow error")
     for i, j in np.ndindex((batch_size, output_channels)):
         for k in range(0, input_channels // 2 * 2, 2):
diff --git a/test/test_matmul_cuda.py b/test/test_matmul_cuda.py
index 804ba80a9cdcf..9da63650f3945 100644
--- a/test/test_matmul_cuda.py
+++ b/test/test_matmul_cuda.py
@@ -361,10 +361,9 @@ def test_float8_basics(self, device) -> None:
         self._test_tautological_mm(device, size=64, out_dtype=torch.float16)
         self._test_tautological_mm(device, size=96, out_dtype=torch.float32)
 
-        # hipblaslt does not yet support bfloat16 output
-        if torch.version.hip is None:
-            self._test_tautological_mm(device, size=80, out_dtype=torch.bfloat16)
-        with self.assertRaises(RuntimeError):
+        self._test_tautological_mm(device, size=80, out_dtype=torch.bfloat16)
+
+        with self.assertRaises(AssertionError if torch.version.hip or device == "cpu" else RuntimeError):
             self._test_tautological_mm(device, out_dtype=e5m2_type)
 
     @unittest.skipIf(not PLATFORM_SUPPORTS_FP8, f8_msg)
diff --git a/test/test_scatter_gather_ops.py b/test/test_scatter_gather_ops.py
index 555c0be18625a..39be20ca2fe05 100644
--- a/test/test_scatter_gather_ops.py
+++ b/test/test_scatter_gather_ops.py
@@ -158,7 +158,7 @@ def _test_scatter_base(self, fn, *, device, dtype, is_scalar, reduction,
             # When we are running opportunistic_fastatomics, we will expect some floating point rounding
             # errors as the order of operation is not guaranteed.
             if TEST_WITH_ROCM \
-                and 'gfx94' in torch.cuda.get_device_properties(0).gcnArchName \
+                and torch.cuda.get_device_properties(0).gcnArchName[0:5] in ('gfx94', 'gfx95')\
                 and not torch.are_deterministic_algorithms_enabled():
                 self.assertEqual(actual, expected, atol=1e-9, rtol=1e-6)
             else:
diff --git a/test/test_sort_and_select.py b/test/test_sort_and_select.py
index 6d37607ffbf19..85e1c317c40aa 100644
--- a/test/test_sort_and_select.py
+++ b/test/test_sort_and_select.py
@@ -209,21 +209,21 @@ def test_stable_sort(self, device, dtype):
         )
 
     @onlyCUDA
-    @dtypes(torch.uint8)
+    @dtypes(torch.float16)
     @largeTensorTest("200GB")  # Unfortunately 80GB A100 is not large enough
     def test_sort_large(self, device, dtype):
         t0 = torch.randperm(8192, device=device).to(dtype)
         t = t0.view(1, 8192).expand(2**18 + 1, -1).contiguous()
         v, i = t.sort()
         del t
-        iv, im = i.var_mean(dim=0)
+        iv, im = torch.var_mean(i.to(dtype), dim=0)
         del i
-        vv, vm = v.var_mean(dim=0)
+        vv, vm = torch.var_mean(v.to(dtype), dim=0)
         del v
         self.assertEqual(vv, torch.zeros_like(vv))
         self.assertEqual(iv, torch.zeros_like(iv))
-        self.assertEqual(vm, torch.arange(255, dtype=dtype, device=device))
-        self.assertEqual(im, t0.sort().indices)
+        self.assertEqual(vm, torch.arange(8192, dtype=dtype, device=device))
+        self.assertEqual(im, t0.sort().indices, exact_dtype=False)
 
     @dtypes(torch.float32)
     def test_sort_restride(self, device, dtype):
diff --git a/test/test_transformers.py b/test/test_transformers.py
index fa29f1866843d..9179757ded449 100644
--- a/test/test_transformers.py
+++ b/test/test_transformers.py
@@ -3302,6 +3302,8 @@ def _get_mem_eff_drop_mask(batch_size, n_heads, q_len, kv_len, p, seed, offset,
                 fudge_factors['grad_query'] = 650.0
             if dtype == torch.float32:
                 fudge_factors['grad_key'] = 90.0
+            if "gfx95" in torch.cuda.get_device_properties(0).gcnArchName:
+                fudge_factors['grad_value'] = 12.0
 
         check_out_and_grad(
             (out_ref, out_lp_ref, out),
diff --git a/torch/_tensor_str.py b/torch/_tensor_str.py
index bc1a80a0e3e14..054688e34a7e3 100644
--- a/torch/_tensor_str.py
+++ b/torch/_tensor_str.py
@@ -344,6 +344,7 @@ def _tensor_str(self, indent):
         torch.float8_e5m2fnuz,
         torch.float8_e4m3fn,
         torch.float8_e4m3fnuz,
+        torch.float8_e8m0fnu,
     ]:
         self = self.half()
 
diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py
index 2cc0d99a73214..8e29f312f4516 100644
--- a/torch/cuda/__init__.py
+++ b/torch/cuda/__init__.py
@@ -1695,6 +1695,7 @@ def addmm_kernel_impl(*args, **kwargs):
     "is_bf16_supported",
     "is_current_stream_capturing",
     "is_initialized",
+    "is_tf32_supported",
     "jiterator",
     "list_gpu_processes",
     "make_graphed_callables",