Commit f695efc

gemmbench: Add support for dynamic dims, use for llama8b_prefill (#80)
Dynamic dims seem to be appropriate for llama8b_prefill based on samples from https://github.com/nod-ai/playbook/issues/63
1 parent 1ca0654 commit f695efc

4 files changed: +196 additions, -14 deletions

iree_kernel_benchmark/gemmbench/__main__.py

Lines changed: 2 additions & 0 deletions
@@ -252,6 +252,7 @@ def compile_gemm(
         config.operand_element_type,
         config.tA,
         config.tB,
+        f"D={config.runtime_dim}" if config.runtime_dim is not None else "",
         round(benchmark_gemm_mean_time_us, 4),
         round(arithmetic_intensity, 4),
         round(tflops_per_second, 4),
@@ -271,6 +272,7 @@ def compile_gemm(
         "dtype",
         "tA",
         "tB",
+        "runtime_dim",
         "mean_microseconds",
         "arithmetic_intensity",
         "tflops",

iree_kernel_benchmark/gemmbench/gemm_utils.py

Lines changed: 62 additions & 11 deletions
@@ -1,6 +1,6 @@
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Tuple
 
 try:
     import iree.turbine.kernel as tk
@@ -24,6 +24,8 @@
 from iree.compiler import ir
 from iree.compiler.dialects import arith, func, linalg, tensor
 
+kDynamic = ir.ShapedType.get_dynamic_size()
+
 
 def num_bytes(dtype: str) -> int:
     dtype_to_bytes = {
@@ -42,6 +44,7 @@ def num_bytes(dtype: str) -> int:
 
 @dataclass
 class GemmConfig:
+    # Note that M, N and K may be set to kDynamic, a special value
     M: int
     N: int
     K: int
@@ -50,37 +53,62 @@ class GemmConfig:
     operand_element_type: str
     accumulator_element_type: str
     result_element_type: str
+    # runtime_dim substitutes for any dynamic dims when executing.
+    # TODO: It would be better if we could execute the same compiled dynamic
+    # kernel for a series of different sizes, rather than duplicating the
+    # GemmConfig. The current design's advantage is that no changes have
+    # to be made to the execution logic (looks just like a static shape).
+    runtime_dim: Optional[int] = None
 
     def get_name(self) -> str:
-        name = f"gemm_{self.M}_{self.N}_{self.K}_{self.operand_element_type}_{self.accumulator_element_type}"
+        M = self.M if self.M != kDynamic else "D"
+        N = self.N if self.N != kDynamic else "D"
+        K = self.K if self.K != kDynamic else "D"
+        name = f"gemm_{M}_{N}_{K}_{self.operand_element_type}_{self.accumulator_element_type}"
         if self.tA == "T":
             name += "_tA"
         elif self.tB == "T":
             name += "_tB"
+        if self.runtime_dim is not None:
+            name += f"_D={self.runtime_dim}"
         return name
 
+    def get_runtime_dims(self) -> Tuple[int, int, int]:
+        """
+        Get concrete dims to use when executing this kernel.
+        """
+        M = self.M if self.M != kDynamic else self.runtime_dim
+        N = self.N if self.N != kDynamic else self.runtime_dim
+        K = self.K if self.K != kDynamic else self.runtime_dim
+        return M, N, K
+
     def get_inp1(self) -> str:
+        M, N, K = self.get_runtime_dims()
         if self.tA == "T":
-            return f"{self.K}x{self.M}x{self.operand_element_type}"
-        return f"{self.M}x{self.K}x{self.operand_element_type}"
+            return f"{K}x{M}x{self.operand_element_type}"
+        return f"{M}x{K}x{self.operand_element_type}"
 
     def get_inp2(self) -> str:
+        M, N, K = self.get_runtime_dims()
         if self.tB == "T":
-            return f"{self.N}x{self.K}x{self.operand_element_type}"
-        return f"{self.K}x{self.N}x{self.operand_element_type}"
+            return f"{N}x{K}x{self.operand_element_type}"
+        return f"{K}x{N}x{self.operand_element_type}"
 
     def get_out(self) -> str:
-        return f"{self.M}x{self.N}x{self.result_element_type}"
+        M, N, K = self.get_runtime_dims()
+        return f"{M}x{N}x{self.result_element_type}"
 
     def get_byte_count(self) -> int:
         operand_bytes_per_element = num_bytes(self.operand_element_type)
         result_bytes_per_element = num_bytes(self.result_element_type)
-        byte_count_input = (self.M + self.N) * self.K * operand_bytes_per_element
-        byte_count_output = (self.M * self.N) * result_bytes_per_element
+        M, N, K = self.get_runtime_dims()
+        byte_count_input = (M + N) * K * operand_bytes_per_element
+        byte_count_output = (M * N) * result_bytes_per_element
         return byte_count_input + byte_count_output
 
     def get_flops(self) -> int:
-        flops = 2 * self.M * self.N * self.K
+        M, N, K = self.get_runtime_dims()
+        flops = 2 * M * N * K
         return flops
 
 
@@ -123,16 +151,22 @@ def generate_mlir(config: GemmConfig):
     # Transpose A
     if tA == "T":
         arg0_type = ir.RankedTensorType.get([K, M], operand_element_type)
+        arg0_M_idx = 1
         arg1_type = ir.RankedTensorType.get([K, N], operand_element_type)
+        arg1_N_idx = 1
    # Transpose B
    elif tB == "T":
         arg0_type = ir.RankedTensorType.get([M, K], operand_element_type)
+        arg0_M_idx = 0
         arg1_type = ir.RankedTensorType.get([N, K], operand_element_type)
+        arg1_N_idx = 0
     # "Normal" path (can't transpose both)
     else:
         assert tA == "N" and tB == "N"
         arg0_type = ir.RankedTensorType.get([M, K], operand_element_type)
+        arg0_M_idx = 0
         arg1_type = ir.RankedTensorType.get([K, N], operand_element_type)
+        arg1_N_idx = 1
     result_type = ir.RankedTensorType.get([M, N], result_element_type)
 
     module = ir.Module.create()
@@ -143,7 +177,24 @@ def main(arg0, arg1):
             zero_element = arith.constant(
                 value=literal_zero, result=acc_element_type
             )
-            empty_tensor = tensor.empty(element_type=acc_element_type, sizes=[M, N])
+            if M == kDynamic:
+                M_dynamic_dim_idx = arith.constant(
+                    value=arg0_M_idx, result=ir.IndexType.get()
+                )
+                M_dynamic_dim = tensor.dim(arg0, M_dynamic_dim_idx)
+            if N == kDynamic:
+                N_dynamic_dim_idx = arith.constant(
+                    value=arg1_N_idx, result=ir.IndexType.get()
+                )
+                N_dynamic_dim = tensor.dim(arg1, N_dynamic_dim_idx)
+
+            empty_tensor = tensor.empty(
+                element_type=acc_element_type,
+                sizes=[
+                    M_dynamic_dim if M == kDynamic else M,
+                    N_dynamic_dim if N == kDynamic else N,
+                ],
+            )
             filled_tensor = linalg.fill(zero_element, outs=[empty_tensor])
 
             if tA == "T":
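
For orientation (not part of the diff above), here is a minimal sketch of how the new fields and helpers interact for a dynamic-M config; the shape values are borrowed from the llama8b_prefill test added below, and the import path assumes the module layout shown in this commit:

    from iree_kernel_benchmark.gemmbench.gemm_utils import GemmConfig, kDynamic

    # M is left dynamic at compile time; runtime_dim supplies the concrete value
    # that get_runtime_dims() substitutes when the kernel is executed.
    cfg = GemmConfig(
        M=kDynamic,
        N=14336,
        K=4096,
        tA="N",
        tB="T",
        operand_element_type="f16",
        accumulator_element_type="f32",
        result_element_type="f16",
        runtime_dim=512,
    )

    print(cfg.get_name())          # gemm_D_14336_4096_f16_f32_tB_D=512
    print(cfg.get_runtime_dims())  # (512, 14336, 4096)
    print(cfg.get_inp1())          # 512x4096xf16 (M x K, since tA == "N")
    print(cfg.get_inp2())          # 14336x4096xf16 (N x K, since tB == "T")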

iree_kernel_benchmark/gemmbench/problems.py

Lines changed: 3 additions & 2 deletions
@@ -4,7 +4,7 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-from .gemm_utils import GemmConfig, num_bytes
+from .gemm_utils import GemmConfig, num_bytes, kDynamic
 
 import re
 
@@ -704,14 +704,15 @@ def llama8b_prefill(dtype: str, raw_accumulators: bool) -> list[GemmConfig]:
     if model == "8b_prefill":
         configs.append(
             GemmConfig(
-                m,
+                kDynamic,
                 n,
                 k,
                 "N",
                 "T",
                 dtype,
                 get_default_accumulator_element_type(dtype),
                 get_default_result_element_type(dtype, raw_accumulators),
+                runtime_dim=m,
             )
         )
     return configs
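
To make the effect of this change concrete, a sketch (not part of the commit; the m value below is an illustrative placeholder, while n and k match the 8b_prefill shapes used in the tests): each sampled triple still produces its own GemmConfig, but the kernel is now compiled with a dynamic M, and the sampled m is only applied at execution time via runtime_dim.

    from iree_kernel_benchmark.gemmbench.gemm_utils import GemmConfig, kDynamic

    m, n, k = 128, 14336, 4096  # m is a placeholder sample value

    # Before this commit: M was baked into the compiled kernel.
    static_cfg = GemmConfig(m, n, k, "N", "T", "f16", "f32", "f16")
    # After: M is dynamic when compiling; m is substituted only when executing.
    dynamic_cfg = GemmConfig(kDynamic, n, k, "N", "T", "f16", "f32", "f16", runtime_dim=m)

    # Execution sees the same concrete dims either way.
    assert static_cfg.get_runtime_dims() == dynamic_cfg.get_runtime_dims() == (m, n, k)
    assert dynamic_cfg.get_name() == "gemm_D_14336_4096_f16_f32_tB_D=128"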

tests/test_gemmbench_mlir_gen.py

Lines changed: 129 additions & 1 deletion
@@ -1,4 +1,8 @@
-from iree_kernel_benchmark.gemmbench.gemm_utils import GemmConfig, generate_mlir
+from iree_kernel_benchmark.gemmbench.gemm_utils import (
+    GemmConfig,
+    generate_mlir,
+    kDynamic,
+)
 from .utils import match_lines
 from iree.compiler import ir
 import pytest
@@ -69,6 +73,130 @@ def test_n_t_f8_f32_f8():
     )
 
 
+def test_n_t_f16_f32_f16_dynamic_dim_M():
+    # From 'llama8b_prefill'
+    cfg = GemmConfig(
+        M=kDynamic,
+        N=14336,
+        K=4096,
+        tA="N",
+        tB="T",
+        operand_element_type="f16",
+        accumulator_element_type="f32",
+        result_element_type="f16",
+        runtime_dim=512,  # Unused, included for correctness
+    )
+    mlir = generate_mlir(cfg)
+    match_lines(
+        mlir,
+        [
+            "module {",
+            "func.func @main(%arg0: tensor<?x4096xf16>, %arg1: tensor<14336x4096xf16>) -> tensor<?x14336xf16> {",
+            "%cst = arith.constant 0.000000e+00 : f32",
+            "%c0 = arith.constant 0 : index",
+            "%dim = tensor.dim %arg0, %c0 : tensor<?x4096xf16>",
+            "%0 = tensor.empty(%dim) : tensor<?x14336xf32>",
+            "%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<?x14336xf32>) -> tensor<?x14336xf32>",
+            "%2 = linalg.matmul_transpose_b {cast = #linalg.type_fn<cast_signed>} ins(%arg0, %arg1 : tensor<?x4096xf16>, tensor<14336x4096xf16>) outs(%1 : tensor<?x14336xf32>) -> tensor<?x14336xf32>",
+            "%3 = arith.truncf %2 : tensor<?x14336xf32> to tensor<?x14336xf16>",
+            "return %3 : tensor<?x14336xf16>",
+        ],
+    )
+
+
+def test_t_n_f16_f32_f16_dynamic_dim_N():
+    # Synthetic example (modified from test_n_t_f16_f32_f16_dynamic_dim_M)
+    cfg = GemmConfig(
+        M=512,
+        N=kDynamic,
+        K=4096,
+        tA="T",
+        tB="N",
+        operand_element_type="f16",
+        accumulator_element_type="f32",
+        result_element_type="f16",
+        runtime_dim=14366,  # Unused, included for correctness
+    )
+    mlir = generate_mlir(cfg)
+    match_lines(
+        mlir,
+        [
+            "module {",
+            "func.func @main(%arg0: tensor<4096x512xf16>, %arg1: tensor<4096x?xf16>) -> tensor<512x?xf16> {",
+            "%cst = arith.constant 0.000000e+00 : f32",
+            "%c1 = arith.constant 1 : index",
+            "%dim = tensor.dim %arg1, %c1 : tensor<4096x?xf16>",
+            "%0 = tensor.empty(%dim) : tensor<512x?xf32>",
+            "%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<512x?xf32>) -> tensor<512x?xf32>",
+            "%2 = linalg.matmul_transpose_a {cast = #linalg.type_fn<cast_signed>} ins(%arg0, %arg1 : tensor<4096x512xf16>, tensor<4096x?xf16>) outs(%1 : tensor<512x?xf32>) -> tensor<512x?xf32>",
+            "%3 = arith.truncf %2 : tensor<512x?xf32> to tensor<512x?xf16>",
+            "return %3 : tensor<512x?xf16>",
+        ],
+    )
+
+
+def test_n_n_f16_f32_f16_dynamic_dim_K():
+    # Synthetic example (modified from test_n_t_f16_f32_f16_dynamic_dim_M)
+    cfg = GemmConfig(
+        M=512,
+        N=14366,
+        K=kDynamic,
+        tA="N",
+        tB="N",
+        operand_element_type="f16",
+        accumulator_element_type="f32",
+        result_element_type="f16",
+        runtime_dim=4096,  # Unused, included for correctness
+    )
+    mlir = generate_mlir(cfg)
+    match_lines(
+        mlir,
+        [
+            "module {",
+            "func.func @main(%arg0: tensor<512x?xf16>, %arg1: tensor<?x14366xf16>) -> tensor<512x14366xf16> {",
+            "%cst = arith.constant 0.000000e+00 : f32",
+            "%0 = tensor.empty() : tensor<512x14366xf32>",
+            "%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<512x14366xf32>) -> tensor<512x14366xf32>",
+            "%2 = linalg.matmul ins(%arg0, %arg1 : tensor<512x?xf16>, tensor<?x14366xf16>) outs(%1 : tensor<512x14366xf32>) -> tensor<512x14366xf32>",
+            "%3 = arith.truncf %2 : tensor<512x14366xf32> to tensor<512x14366xf16>",
+            "return %3 : tensor<512x14366xf16>",
+        ],
+    )
+
+
+def test_n_n_f16_f32_f16_dynamic_dim_M_N():
+    # Synthetic example (modified from test_n_t_f16_f32_f16_dynamic_dim_M)
+    cfg = GemmConfig(
+        M=kDynamic,
+        N=kDynamic,
+        K=4096,
+        tA="N",
+        tB="N",
+        operand_element_type="f16",
+        accumulator_element_type="f32",
+        result_element_type="f16",
+        runtime_dim=512,  # Unused, included for correctness
+    )
+    mlir = generate_mlir(cfg)
+    match_lines(
+        mlir,
+        [
+            "module {",
+            "func.func @main(%arg0: tensor<?x4096xf16>, %arg1: tensor<4096x?xf16>) -> tensor<?x?xf16> {",
+            "%cst = arith.constant 0.000000e+00 : f32",
+            "%c0 = arith.constant 0 : index",
+            "%dim = tensor.dim %arg0, %c0 : tensor<?x4096xf16>",
+            "%c1 = arith.constant 1 : index",
+            "%dim_0 = tensor.dim %arg1, %c1 : tensor<4096x?xf16>",
+            "%0 = tensor.empty(%dim, %dim_0) : tensor<?x?xf32>",
+            "%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<?x?xf32>) -> tensor<?x?xf32>",
+            "%2 = linalg.matmul ins(%arg0, %arg1 : tensor<?x4096xf16>, tensor<4096x?xf16>) outs(%1 : tensor<?x?xf32>) -> tensor<?x?xf32>",
+            "%3 = arith.truncf %2 : tensor<?x?xf32> to tensor<?x?xf16>",
+            "return %3 : tensor<?x?xf16>",
+        ],
+    )
+
+
 def test_n_t_bf16_f32_bf16():
     # From 'llama70bmemory'
     cfg = GemmConfig(
