Skip to content

Commit d10296c

Browse files
[release/2.6] fix scaled matmul and test_float8_basics_cuda (#2739)
This PR fixes: - test_matmul_cuda.py::TestFP8MatmulCudaCUDA::test_float8_basics_cuda - AssertionError: RuntimeError not raised - test_matmul_cuda.py::TestFP8MatmulCudaCUDA::test_scaled_mm_vs_emulated_row_wise_bfloat16_cuda - AssertionError: Tensor-likes are not close! We need to swap the A_SCALE and B_SCALE descriptor data when `use_rowwise` is set, in the same way as the [HIPBLASLT_VEC_EXT](https://github.com/ROCm/pytorch/blob/78f6ff789a11bcdca072f019305485d1cf06c7eb/aten/src/ATen/cuda/CUDABlas.cpp#L1450-L1454) path already does. Fixes SWDEV-544098
1 parent 78f6ff7 commit d10296c

File tree

2 files changed

+9
-1
lines changed

2 files changed

+9
-1
lines changed

aten/src/ATen/cuda/CUDABlas.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1447,6 +1447,12 @@ void scaled_gemm(
14471447
#if defined(USE_ROCM)
14481448
#if defined(HIPBLASLT_OUTER_VEC)
14491449
// this case is handled later as hipified CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F
1450+
if (use_rowwise) {
1451+
// swapped
1452+
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, mat2_scale_ptr);
1453+
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, mat1_scale_ptr);
1454+
}
1455+
else
14501456
#elif defined(HIPBLASLT_VEC_EXT)
14511457
if (use_rowwise) {
14521458
// swapped

test/test_matmul_cuda.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Owner(s): ["module: linear algebra"]
22

3+
from contextlib import nullcontext
34
import unittest
45
from itertools import product
56
from functools import partial
@@ -356,7 +357,8 @@ def test_float8_basics(self, device) -> None:
356357
self._test_tautological_mm(device, e4m3_type, e5m2_type, size=32)
357358
self._test_tautological_mm(device, e5m2_type, e4m3_type, size=48)
358359
# According to https://docs.nvidia.com/cuda/cublas/#id99 8F_E5M2 MM is unsupported
359-
with self.assertRaises(RuntimeError):
360+
# supported on ROCm but fails on CUDA
361+
with self.assertRaises(RuntimeError) if torch.version.hip is None else nullcontext():
360362
self._test_tautological_mm(device, e5m2_type, e5m2_type)
361363

362364
self._test_tautological_mm(device, size=64, out_dtype=torch.float16)

0 commit comments

Comments
 (0)