@@ -58,10 +58,20 @@ def evaluate_platform_supports_fp8():

def evaluate_platform_supports_mxfp8():
    if torch.cuda.is_available():
+        if torch.version.hip:
+            return False
        return torch.cuda.get_device_capability() >= (10, 0)
    return False


+def evaluate_cuda_platform_version(major: int):
+    if torch.version.cuda:
+        return torch.cuda.get_device_capability() >= (major, 0)
+    return False
+
+
+SM90_OR_LATER = evaluate_cuda_platform_version(9)
+
SUPPORTS_FP8 = evaluate_platform_supports_fp8()

SUPPORTS_MXFP8 = evaluate_platform_supports_mxfp8()
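The SM90_OR_LATER flag introduced here is consumed later in this diff, where the relocated fast-gemv tests are gated at the class level instead of per test method. A minimal sketch of that gating pattern, assuming it sits in the same module as the flag; ExampleHopperTests and its test body are illustrative placeholders, not part of the change:

    import unittest

    import torch


    @unittest.skipIf(not torch.cuda.is_available(), "Skip when GPU is not available")
    @unittest.skipIf(not SM90_OR_LATER, "Skip when not SM90+")
    class ExampleHopperTests(unittest.TestCase):
        def test_device_is_sm90_or_newer(self) -> None:
            # evaluate_cuda_platform_version(9) returns False on ROCm and
            # CPU-only builds, so this body only runs on SM90+ CUDA devices.
            self.assertGreaterEqual(torch.cuda.get_device_capability(), (9, 0))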
@@ -1898,8 +1908,73 @@ def test_quantize_compile(self) -> None:
        torch.compile(torch.ops.fbgemm.bf16_fast_gemv)(X, W_bf16)

    @unittest.skipIf(
-        not torch.version.cuda, "Skip on AMD: fast gemv op is not yet supported."
+        torch.version.hip, "Skip on AMD: CUDA quantize op is not yet supported."
+    )
+    @settings(deadline=None)
+    @given(
+        K=st.sampled_from([0, 128]),
    )
+    def test_quantize_zero_input(self, K) -> None:
+        w = torch.randn(
+            size=(0, K),
+            dtype=torch.bfloat16,
+            device=self.device,
+        )
+        w_scale_ref = torch.empty(
+            size=(0,),
+            dtype=torch.float32,
+            device=self.device,
+        )
+        wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_row(w)
+        torch.testing.assert_close(w.shape, wq.shape)
+        torch.testing.assert_close(w_scale.shape, w_scale_ref.shape)
+
+    @unittest.skipIf(torch.version.hip, "Skip on AMD: fp8 lite op not yet supported.")
+    @settings(deadline=None)
+    @given(
+        M=st.sampled_from([1, 4]),
+        N=st.sampled_from([1024, 6144]),
+        K=st.sampled_from([512, 3584]),
+        CudaGraph=st.sampled_from([True, False]),
+    )
+    def test_fp8_lite_matmul(self, M: int, N: int, K: int, CudaGraph: bool) -> None:
+        x = (
+            torch.randn(
+                size=(M, K),
+                dtype=torch.bfloat16,
+                device=self.device,
+            )
+            * 0.1
+        )
+        w = (
+            torch.randn(
+                size=(N, K),
+                dtype=torch.bfloat16,
+                device=self.device,
+            )
+            * 0.01
+        )
+        xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_tensor(x)
+        wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_tensor(w)
+        if CudaGraph:
+            zq = torch.ops.fbgemm.f8f8bf16_lite(xq, wq, x_scale * w_scale)
+            g = torch.cuda.CUDAGraph()
+            with torch.cuda.graph(g):
+                zq = torch.ops.fbgemm.f8f8bf16_lite(xq, wq, x_scale * w_scale)
+            g.replay()
+        else:
+            zq = torch.ops.fbgemm.f8f8bf16_lite(xq, wq, x_scale * w_scale)
+        zq_ref = (x @ w.T).to(torch.bfloat16)
+        torch.testing.assert_close(zq, zq_ref, atol=9.0e-2, rtol=9.0e-2)
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "Skip when GPU is not available")
+@unittest.skipIf(not SM90_OR_LATER, "Skip when not SM90+")
+class FastGemvTests(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.device = torch.accelerator.current_accelerator()
+
    def run_gemv(
        self, test_cases, gemv_op, atol, rtol, quantize_w=False, quantize_x=False
    ):
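The test_fp8_lite_matmul case added above runs the op once eagerly as a warm-up, captures the same call in a CUDA graph, and then replays it. A minimal sketch of that warm-up/capture/replay pattern, with a plain matmul standing in for the fbgemm op and run_matmul_under_graph as a placeholder helper:

    import torch

    def run_matmul_under_graph(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        # Warm-up call outside capture, mirroring the test above.
        out = a @ b.T
        g = torch.cuda.CUDAGraph()
        with torch.cuda.graph(g):
            # Only the kernels launched here are recorded; g.replay() re-executes
            # them against the same input addresses, which is why the test builds
            # xq, wq, and the scales once before capturing.
            out = a @ b.T
        g.replay()
        return out

The test then compares the replayed result against a bf16 reference (x @ w.T) with loose tolerances that absorb the fp8 quantization error on both operands.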
@@ -1933,9 +2008,6 @@ def run_gemv(
            z_ref = (x @ w.T).to(torch.bfloat16).to(self.device)
            torch.testing.assert_close(z, z_ref, atol=atol, rtol=rtol)

-    @unittest.skipIf(
-        not torch.version.cuda, "Skip on AMD: fast gemv op is not yet supported."
-    )
    def run_gemv_batched(self, test_cases, gemv_op, atol, rtol):
        for B, M, N, K in test_cases:
            x = (
@@ -1964,9 +2036,6 @@ def run_gemv_batched(self, test_cases, gemv_op, atol, rtol):
            z_ref = torch.bmm(x, w.transpose(1, 2)).to(torch.bfloat16).to(self.device)
            torch.testing.assert_close(z, z_ref, atol=atol, rtol=rtol)

-    @unittest.skipIf(
-        not torch.version.cuda, "Skip on AMD: fast gemv op is not yet supported."
-    )
    def test_bf16_gemv(self) -> None:
        test_cases = [
            (1, 128, 256),
@@ -1990,9 +2059,6 @@ def test_bf16_gemv(self) -> None:
        ]
        self.run_gemv(test_cases, torch.ops.fbgemm.bf16_fast_gemv, 9.0e-3, 9.0e-3)

-    @unittest.skipIf(
-        not torch.version.cuda, "Skip on AMD: fast gemv op is not yet supported."
-    )
    def test_bf16_fp8_gemv(self) -> None:
        test_cases = [
            (1, 1280, 8192),
@@ -2016,9 +2082,6 @@ def test_bf16_fp8_gemv(self) -> None:
            quantize_w=True,
        )

-    @unittest.skipIf(
-        not torch.version.cuda, "Skip on AMD: fast gemv op is not yet supported."
-    )
    def test_fp8_fp8_gemv(self) -> None:
        test_cases = [
            (1, 1280, 8192),
@@ -2055,9 +2118,6 @@ def test_fp8_fp8_gemv(self) -> None:
            quantize_x=True,
        )

-    @unittest.skipIf(
-        not torch.version.cuda, "Skip on AMD: fast gemv op is not yet supported."
-    )
    def test_fp8_gemv_batched(self) -> None:
        test_cases = [
            (2, 1, 4096, 5120),
@@ -2082,66 +2142,6 @@ def test_fp8_gemv_batched(self) -> None:
            1.0e-1,
        )

-    @unittest.skipIf(
-        torch.version.hip, "Skip on AMD: cuda quantize op is yet supported."
-    )
-    @settings(deadline=None)
-    @given(
-        K=st.sampled_from([0, 128]),
-    )
-    def test_quantize_zero_input(self, K) -> None:
-        w = torch.randn(
-            size=(0, K),
-            dtype=torch.bfloat16,
-            device=self.device,
-        )
-        w_scale_ref = torch.empty(
-            size=(0,),
-            dtype=torch.float32,
-            device=self.device,
-        )
-        wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_row(w)
-        torch.testing.assert_close(w.shape, wq.shape)
-        torch.testing.assert_close(w_scale.shape, w_scale_ref.shape)
-
-    @unittest.skipIf(torch.version.hip, "Skip on AMD: fp8 lite op is yet suported.")
-    @settings(deadline=None)
-    @given(
-        M=st.sampled_from([1, 4]),
-        N=st.sampled_from([1024, 6144]),
-        K=st.sampled_from([512, 3584]),
-        CudaGraph=st.sampled_from([True, False]),
-    )
-    def test_fp8_lite_matmul(self, M: int, N: int, K: int, CudaGraph: bool) -> None:
-        x = (
-            torch.randn(
-                size=(M, K),
-                dtype=torch.bfloat16,
-                device=self.device,
-            )
-            * 0.1
-        )
-        w = (
-            torch.randn(
-                size=(N, K),
-                dtype=torch.bfloat16,
-                device=self.device,
-            )
-            * 0.01
-        )
-        xq, x_scale = torch.ops.fbgemm.quantize_fp8_per_tensor(x)
-        wq, w_scale = torch.ops.fbgemm.quantize_fp8_per_tensor(w)
-        if CudaGraph:
-            zq = torch.ops.fbgemm.f8f8bf16_lite(xq, wq, x_scale * w_scale)
-            g = torch.cuda.CUDAGraph()
-            with torch.cuda.graph(g):
-                zq = torch.ops.fbgemm.f8f8bf16_lite(xq, wq, x_scale * w_scale)
-            g.replay()
-        else:
-            zq = torch.ops.fbgemm.f8f8bf16_lite(xq, wq, x_scale * w_scale)
-        zq_ref = (x @ w.T).to(torch.bfloat16)
-        torch.testing.assert_close(zq, zq_ref, atol=9.0e-2, rtol=9.0e-2)
-

    @unittest.skipIf(
        not torch.cuda.is_available() or torch.version.hip,