Commit 056ddb0

Use get_fp8_constants from fp8_utils.py instead of fbgemm_gpu (#444)
1 parent 0063982 commit 056ddb0

File tree

4 files changed (+56 -13 lines)


tritonbench/operators/fp8_gemm_rowwise/aoti_fp8_triton_mm.py

Lines changed: 2 additions & 4 deletions

@@ -4,12 +4,10 @@
 import torch
 import triton
 import triton.language as tl
-
-from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import (
-    get_fp8_constants as get_fp8_constants,
-)
 from triton import Config
 
+from tritonbench.utils.fp8_utils import get_fp8_constants
+
 FP8_DTYPE, _, _, _ = get_fp8_constants()
 E4M3_MAX_POS: float = torch.finfo(FP8_DTYPE).max
 EPS: float = 1e-12
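For context, a minimal sketch (not part of this commit) of the usual rowwise FP8 quantization recipe these constants serve; quantize_fp8_rowwise is a hypothetical helper, not a tritonbench API:

import torch

from tritonbench.utils.fp8_utils import get_fp8_constants

# get_fp8_constants() returns (torch dtype, triton dtype, max value, eps).
FP8_DTYPE, _, E4M3_MAX_POS, EPS = get_fp8_constants()

def quantize_fp8_rowwise(x: torch.Tensor):
    # Per-row absolute max, clamped by EPS so all-zero rows don't divide by zero.
    row_max = x.abs().amax(dim=1, keepdim=True).clamp(min=EPS)
    scale = E4M3_MAX_POS / row_max
    xq = (x * scale).clamp(-E4M3_MAX_POS, E4M3_MAX_POS).to(FP8_DTYPE)
    # Return the quantized rows plus the reciprocal scale for dequantization.
    return xq, scale.reciprocal()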

tritonbench/operators/fp8_gemm_rowwise/operator.py

Lines changed: 2 additions & 1 deletion

@@ -69,9 +69,10 @@ def parse_args(args: List[str]) -> argparse.Namespace:
 HAS_CUTLASS_OR_CK = False
 HAS_CUBLAS = False
 
+from tritonbench.utils.fp8_utils import get_fp8_constants
+
 try:
     from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import (
-        get_fp8_constants as get_fp8_constants,
         matmul_fp8_row as triton_fp8_row,
     )
 

tritonbench/operators/fp8_gemm_rowwise_grouped/operator.py

Lines changed: 1 addition & 8 deletions

@@ -157,14 +157,7 @@ def parse_args(args: List[str]) -> argparse.Namespace:
 HAS_TRITON = False
 HAS_CUTLASS_OR_CK = False
 
-# Try to import Triton GEMM module
-try:
-    from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import (
-        get_fp8_constants as get_fp8_constants,
-    )
-except (ImportError, AssertionError):
-    # If import fails, set HAS_TRITON to False
-    HAS_TRITON = False
+from tritonbench.utils.fp8_utils import get_fp8_constants
 
 # Try to import Triton grouped GEMM module
 try:
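The unconditional import is safe here because tritonbench.utils.fp8_utils depends only on torch and triton; the ImportError guard remains only for the fbgemm-backed kernels. For reference, a minimal sketch of that guarded-import pattern (placeholder module name, not a real dependency):

HAS_BACKEND = True
try:
    import some_optional_backend  # placeholder for an optional backend such as fbgemm_gpu
except (ImportError, AssertionError):
    # Backend missing or incompatible: record that it is unavailable and fall back.
    HAS_BACKEND = False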

tritonbench/utils/fp8_utils.py

Lines changed: 51 additions & 0 deletions

@@ -0,0 +1,51 @@
+"""FP8 utilities for tritonbench operators."""
+
+import functools
+import os
+from typing import Tuple
+
+import torch
+import triton.language as tl
+
+
+@functools.lru_cache
+def supports_float8_fnuz(throw_on_hip_incompatibility: bool = True) -> bool:
+    if torch.version.hip:
+        device_capability = torch.cuda.get_device_capability()
+
+        if device_capability < (9, 4):
+            gpu_arch = torch.cuda.get_device_properties("cuda").gcnArchName
+            msg = f"Unsupported GPU arch: {gpu_arch} for FP8"
+            if throw_on_hip_incompatibility:
+                raise RuntimeError(msg)
+            else:
+                import logging
+
+                logging.error(msg)
+                return False
+
+        elif device_capability == (9, 4):
+            return True
+
+    return False
+
+
+def get_fp8_constants() -> Tuple[torch.dtype, tl.dtype, float, float]:
+    """
+    Helper function to get constant values for the current platform.
+
+    Returns:
+        pt_dtype (torch.dtype): The correct torch fp8 datatype.
+        tl_dtype (tl.dtype): The correct triton fp8 datatype.
+        max_fp8 (float): The maximum representable value for the fp8 datatype.
+        eps (float): Minimum clip value to prevent divide by zero.
+    """
+    running_on_github: bool = os.getenv("GITHUB_ENV") is not None
+    if supports_float8_fnuz(throw_on_hip_incompatibility=(not running_on_github)):
+        pt_fp8_dtype = torch.float8_e4m3fnuz
+        tl_fp8_dtype = tl.float8e4b8
+    else:
+        pt_fp8_dtype = torch.float8_e4m3fn
+        tl_fp8_dtype = tl.float8e4nv
+
+    return pt_fp8_dtype, tl_fp8_dtype, torch.finfo(pt_fp8_dtype).max, 1e-12
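For reference, a minimal usage sketch of the new helper, mirroring how the operators above consume it; the printed values are what torch.finfo reports, with the NVIDIA branch assumed:

from tritonbench.utils.fp8_utils import get_fp8_constants

pt_dtype, tl_dtype, max_fp8, eps = get_fp8_constants()
print(pt_dtype)  # torch.float8_e4m3fn on CUDA; torch.float8_e4m3fnuz on MI300-class ROCm
print(max_fp8)   # torch.finfo(pt_dtype).max: 448.0 for float8_e4m3fn, 240.0 for float8_e4m3fnuz
print(eps)       # 1e-12, the divide-by-zero clip floor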
