@@ -1530,22 +1530,34 @@ def test_scaled_mm_vs_emulated_row_wise(self, base_dtype):
         x_fp8 = to_fp8_saturated(x * x_scales, e4m3_type)
         y_fp8 = to_fp8_saturated(y * y_scales, e4m3_type)
 
-        # Calculate actual F8 mm
-        out_scaled_mm = mm_float8(
-            x_fp8, y_fp8, a_scale=x_scales, b_scale=y_scales, output_dtype=output_dtype
-        )
+        def test():
+            # Calculate actual F8 mm
+            out_scaled_mm = mm_float8(
+                x_fp8, y_fp8, a_scale=x_scales, b_scale=y_scales, output_dtype=output_dtype
+            )
 
-        # Calculate emulated F8 mm
-        out_emulated = mm_float8_emulated(
-            x_fp8, x_scales, y_fp8, y_scales, output_dtype
-        )
+            # Calculate emulated F8 mm
+            out_emulated = mm_float8_emulated(
+                x_fp8, x_scales, y_fp8, y_scales, output_dtype
+            )
 
-        if base_dtype in {torch.bfloat16, torch.float16}:
-            atol, rtol = 7e-2, 7e-2
-        else:
-            atol, rtol = 2e-3, 2e-3
+            if base_dtype in {torch.bfloat16, torch.float16}:
+                atol, rtol = 7e-2, 7e-2
+            else:
+                atol, rtol = 2e-3, 2e-3
 
-        torch.testing.assert_close(out_scaled_mm, out_emulated, atol=atol, rtol=rtol)
+            self.assertEqual(out_scaled_mm, out_emulated, atol=atol, rtol=rtol)
+
+        # Only cuBLAS supports row-wise scaling with fp32 output, and cuBLAS
+        # only supports row-wise scaling on SM 9.0.
+        if torch.cuda.get_device_capability() != (9, 0) and output_dtype == torch.float:
+            with self.assertRaisesRegex(
+                RuntimeError,
+                "Only bf16 high precision output types are supported for row-wise scaling."
+            ):
+                test()
+        else:
+            test()
 
     @unittest.skipIf(not PLATFORM_SUPPORTS_FP8 or IS_WINDOWS, f8_msg)
     @unittest.skipIf(not IS_SM90, "cuBLAS blockwise scaling requires sm90+")
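The key change in this diff is structural: the comparison logic moves into a local test() closure so the same code path can be run either directly or under self.assertRaisesRegex, depending on whether the device/dtype combination is supported. A minimal self-contained sketch of that pattern follows; scaled_mm_stub, SUPPORTS_FP32_OUTPUT, and the class name are hypothetical stand-ins for illustration, not PyTorch APIs.

import unittest

# Hypothetical stand-in for the hardware condition checked in the diff
# (cuBLAS row-wise scaling with fp32 output, available only on SM 9.0).
SUPPORTS_FP32_OUTPUT = False


def scaled_mm_stub(output_dtype):
    # Stand-in for mm_float8: raises exactly when the backend would.
    if output_dtype == "float32" and not SUPPORTS_FP32_OUTPUT:
        raise RuntimeError(
            "Only bf16 high precision output types are supported for row-wise scaling."
        )
    return 0.0


class ConditionalExpectedErrorPattern(unittest.TestCase):
    def test_row_wise_output_dtype(self):
        output_dtype = "float32"

        def body():
            # The real test compares the scaled mm against an emulated
            # reference here; the stub stands in for that call.
            result = scaled_mm_stub(output_dtype)
            self.assertEqual(result, 0.0)

        # Same shape as the diff: run the body under an expected-error
        # context when the configuration is unsupported, directly otherwise.
        if output_dtype == "float32" and not SUPPORTS_FP32_OUTPUT:
            with self.assertRaisesRegex(
                RuntimeError, "Only bf16 high precision output types"
            ):
                body()
        else:
            body()


if __name__ == "__main__":
    unittest.main()

The closure keeps the success and failure paths byte-for-byte identical, so the test cannot drift into asserting different behavior on supported versus unsupported hardware.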