@@ -58,6 +58,7 @@ def get_flops(self) -> int:
 @dataclass
 class TuningSpec:
     wg_tiles: list[int]
+    reduction_tiles: list[int]
     M_warp: int
     N_warp: int
     intrinsic: str
@@ -66,8 +67,11 @@ class TuningSpec:
 
     def get_lowering_config(self) -> str:
         return (
-            f"#iree_codegen.lowering_config<"
-            + f"tile_sizes = [[{','.join([str(x) for x in self.wg_tiles])}]]"
+            f"#iree_gpu.lowering_config<"
+            + "{ "
+            + f"workgroup = [{', '.join(map(str, self.wg_tiles))}], "
+            + f"reduction = [{', '.join(map(str, self.reduction_tiles))}]"
+            + " }"
             + f">"
         )
 
@@ -145,7 +149,7 @@ def generate_mlir(config: AttentionConfig, tuning: Optional[TuningSpec] = None):
 
 
 def get_attention_flags() -> list[str]:
-    return []
+    return ["--iree-codegen-gpu-native-math-precision"]
 
 
 def compile_attention_config(
@@ -157,7 +161,7 @@ def compile_attention_config(
 
     # TODO: Use different tuning specs for different configs. This is just a
     # general tuning config that worked well for sdxl shapes.
-    spec = TuningSpec([1, 128, 0, 0, 32], 4, 1, "MFMA_F32_32x32x8_F16", 2, True)
+    spec = TuningSpec([1, 128, 0, 0, 0], [0, 0, 0, 0, 32], 4, 1, "MFMA_F32_32x32x8_F16", 2, True)
     # Generate mlir content
     mlir_content = generate_mlir(config, spec)
 
@@ -196,3 +200,9 @@ def compile_attention_config(
         return mlir_file, None
 
     return mlir_file, vmfb_file
+
+# Dummy test generation
+if __name__ == "__main__":
+    config = AttentionConfig(20, 4096, 64, 64, 4096, "f16")
+    spec = TuningSpec([1, 128, 0, 0, 0], [0, 0, 0, 0, 32], 4, 1, "MFMA_F32_32x32x8_F16", 2, True)
+    print(generate_mlir(config, spec))
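
For reference, a minimal sketch (using the dummy spec from the __main__ block above) of the attribute string the new get_lowering_config() builds; the expected output is inferred from the string concatenation in this diff:

spec = TuningSpec([1, 128, 0, 0, 0], [0, 0, 0, 0, 32], 4, 1, "MFMA_F32_32x32x8_F16", 2, True)
print(spec.get_lowering_config())
# Expected: #iree_gpu.lowering_config<{ workgroup = [1, 128, 0, 0, 0], reduction = [0, 0, 0, 0, 32] }>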