Commit c31afba

gemmbench: Add support for four FP8 types and test one in CI (#78)
The added types are those supported by both MLIR and IREE that also have some AMD GPU support: f8E4M3FNUZ and f8E5M2FNUZ are supported on CDNA3, whereas f8E4M3FN and f8E5M2 are supported on RDNA4. f8E4M3FNUZ is tested in CI (on MI300).
1 parent 287c79d commit c31afba

5 files changed: 67 additions, 19 deletions

.github/workflows/run_bench.yml

Lines changed: 5 additions & 0 deletions
@@ -56,6 +56,11 @@ jobs:
           source bench_venv/bin/activate
           python -m iree_kernel_benchmark.gemmbench --dtypes f16
 
+      - name: GEMM FP8 (f8E4M3FNUZ)
+        run: |
+          source bench_venv/bin/activate
+          python -m iree_kernel_benchmark.gemmbench --dtypes f8E4M3FNUZ
+
       - name: GEMM I8
         run: |
           source bench_venv/bin/activate
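
Outside CI, the same sweep can be reproduced locally. Below is a minimal Python sketch, assuming the bench_venv environment created earlier in the workflow is already active so that sys.executable resolves to its interpreter:

import subprocess
import sys

# Mirror the new CI step: run the GEMM benchmarks for the f8E4M3FNUZ data type.
subprocess.run(
    [sys.executable, "-m", "iree_kernel_benchmark.gemmbench", "--dtypes", "f8E4M3FNUZ"],
    check=True,
)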

iree_kernel_benchmark/gemmbench/__main__.py

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@ def compile_gemm(
         "--dtypes",
         nargs="+",
         default=[],
-        help="List of data types to generate benchmarks for. Defaults to f16. Other options include f32, bf16, i8.",
+        help="List of data types to generate benchmarks for. Defaults to f16. Other options include (for example) f32, bf16, i8, f8E4M3FNUZ.",
     )
     parser.add_argument(
         "--raw_accumulators",

iree_kernel_benchmark/gemmbench/gemm_utils.py

Lines changed: 21 additions & 10 deletions
@@ -25,6 +25,21 @@
 from iree.compiler.dialects import arith, func, linalg, tensor
 
 
+def num_bytes(dtype: str) -> int:
+    dtype_to_bytes = {
+        "f32": 4,
+        "f16": 2,
+        "bf16": 2,
+        "f8E4M3FNUZ": 1,
+        "f8E5M2FNUZ": 1,
+        "f8E4M3FN": 1,
+        "f8E5M2": 1,
+        "i8": 1,
+        "i32": 4,
+    }
+    return dtype_to_bytes[dtype]
+
+
 @dataclass
 class GemmConfig:
     M: int

@@ -58,16 +73,8 @@ def get_out(self) -> str:
         return f"{self.M}x{self.N}x{self.result_element_type}"
 
     def get_byte_count(self) -> int:
-        dtype_to_bytes = {
-            "f32": 4,
-            "f16": 2,
-            "bf16": 2,
-            "f8E4M3FNUZ": 1,
-            "i8": 1,
-            "i32": 4,
-        }
-        operand_bytes_per_element = dtype_to_bytes[self.operand_element_type]
-        result_bytes_per_element = dtype_to_bytes[self.result_element_type]
+        operand_bytes_per_element = num_bytes(self.operand_element_type)
+        result_bytes_per_element = num_bytes(self.result_element_type)
         byte_count_input = (self.M + self.N) * self.K * operand_bytes_per_element
         byte_count_output = (self.M * self.N) * result_bytes_per_element
         return byte_count_input + byte_count_output
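
As a quick check of the arithmetic for the new 1-byte types, here is a standalone sketch of get_byte_count for the shape used in the new f8 test (M=512, N=4096, K=14336, with f8E4M3FNUZ operands and result; the per-element sizes come from num_bytes above):

M, N, K = 512, 4096, 14336
operand_bytes_per_element = 1  # num_bytes("f8E4M3FNUZ")
result_bytes_per_element = 1   # num_bytes("f8E4M3FNUZ")

byte_count_input = (M + N) * K * operand_bytes_per_element  # 66,060,288
byte_count_output = (M * N) * result_bytes_per_element      # 2,097,152
print(byte_count_input + byte_count_output)                 # 68,157,440
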
@@ -83,6 +90,10 @@ def _convert_dtype_to_mlir(dtype: str) -> ir.Type:
         "i16": lambda: ir.IntegerType.get_signless(16),
         "i32": lambda: ir.IntegerType.get_signless(32),
         "i64": lambda: ir.IntegerType.get_signless(64),
+        "f8E4M3FNUZ": lambda: ir.Float8E4M3FNUZType.get(),
+        "f8E5M2FNUZ": lambda: ir.Float8E5M2FNUZType.get(),
+        "f8E4M3FN": lambda: ir.Float8E4M3FNType.get(),
+        "f8E5M2": lambda: ir.Float8E5M2Type.get(),
         "f16": lambda: ir.F16Type.get(),
         "f32": lambda: ir.F32Type.get(),
         "f64": lambda: ir.F64Type.get(),

iree_kernel_benchmark/gemmbench/problems.py

Lines changed: 12 additions & 8 deletions
@@ -4,19 +4,23 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-from .gemm_utils import GemmConfig
+from .gemm_utils import GemmConfig, num_bytes
 
 import re
 
 
-def num_bytes(dtype: str) -> int:
-    return {"f16": 2, "bf16": 2, "f32": 4, "i8": 1, "i32": 4}[dtype]
-
-
 def get_default_accumulator_element_type(operand_element_type: str) -> str:
-    return {"f16": "f32", "bf16": "f32", "f32": "f32", "i8": "i32", "i32": "i32"}[
-        operand_element_type
-    ]
+    return {
+        "f16": "f32",
+        "bf16": "f32",
+        "f32": "f32",
+        "f8E4M3FNUZ": "f32",
+        "f8E5M2FNUZ": "f32",
+        "f8E4M3FN": "f32",
+        "f8E5M2": "f32",
+        "i8": "i32",
+        "i32": "i32",
+    }[operand_element_type]
 
 
 def get_default_result_element_type(
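
In other words, every FP8 operand type now defaults to an f32 accumulator, matching f16 and bf16. A small sketch, assuming the package is importable as iree_kernel_benchmark.gemmbench.problems:

from iree_kernel_benchmark.gemmbench.problems import (
    get_default_accumulator_element_type,
)

# All four FP8 variants accumulate in f32 by default.
for dtype in ("f8E4M3FNUZ", "f8E5M2FNUZ", "f8E4M3FN", "f8E5M2"):
    assert get_default_accumulator_element_type(dtype) == "f32"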

tests/test_gemmbench_mlir_gen.py

Lines changed: 28 additions & 0 deletions
@@ -41,6 +41,34 @@ def test_n_t_f16_f32_f16():
     )
 
 
+def test_n_t_f8_f32_f8():
+    # From 'llama8b_prefill' (f8 version is synthetic)
+    cfg = GemmConfig(
+        M=512,
+        N=4096,
+        K=14336,
+        tA="N",
+        tB="T",
+        operand_element_type="f8E4M3FNUZ",
+        accumulator_element_type="f32",
+        result_element_type="f8E4M3FNUZ",
+    )
+    mlir = generate_mlir(cfg)
+    match_lines(
+        mlir,
+        [
+            "module {",
+            "func.func @main(%arg0: tensor<512x14336xf8E4M3FNUZ>, %arg1: tensor<4096x14336xf8E4M3FNUZ>) -> tensor<512x4096xf8E4M3FNUZ> {",
+            "%cst = arith.constant 0.000000e+00 : f32",
+            "%0 = tensor.empty() : tensor<512x4096xf32>",
+            "%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<512x4096xf32>) -> tensor<512x4096xf32>",
+            "%2 = linalg.matmul_transpose_b {cast = #linalg.type_fn<cast_signed>} ins(%arg0, %arg1 : tensor<512x14336xf8E4M3FNUZ>, tensor<4096x14336xf8E4M3FNUZ>) outs(%1 : tensor<512x4096xf32>) -> tensor<512x4096xf32>",
+            "%3 = arith.truncf %2 : tensor<512x4096xf32> to tensor<512x4096xf8E4M3FNUZ>",
+            "return %3 : tensor<512x4096xf8E4M3FNUZ>",
+        ],
+    )
+
+
 def test_n_t_bf16_f32_bf16():
     # From 'llama70bmemory'
     cfg = GemmConfig(
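
The new case can be run on its own with pytest's -k name filter. A minimal sketch, assuming pytest is installed and the working directory is the repository root (-k f8 selects any test whose name contains "f8"):

import sys

import pytest

# Run only the FP8 MLIR-generation test(s) from this file.
sys.exit(pytest.main(["-k", "f8", "tests/test_gemmbench_mlir_gen.py"]))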

0 commit comments
