
Commit 569b02b

[Wave] Cleanup enable_scheduling_barriers, dump_generated_mlir copypaste from the test files (#46)
Set everything in one place.

---------

Signed-off-by: Ivan Butygin <[email protected]>
1 parent 51bf969

20 files changed: +94 -377 lines

tests/conftest.py

Lines changed: 21 additions & 0 deletions

@@ -53,6 +53,21 @@ def perf_filename_iree(dump_perf_path, request):
     return os.path.join(dump_perf_path, "iree_" + request.node.name + ".json")


+@pytest.fixture(scope="function", autouse=True)
+def set_mlir_filename(request):
+    option = request.config.getoption("--dump-mlir-files-path")
+    if not option:
+        return
+
+    import iree.turbine.kernel.wave.utils.run_utils as run_utils
+
+    run_utils.dump_generated_mlir = True
+    run_utils.dump_generated_mlir_file = os.path.join(
+        option,
+        "mlir_" + request.node.name + ".mlir",
+    )
+
+
 def pytest_addoption(parser):
     parser.addoption(
         "--run-e2e", action="store_true", default=False, help="run e2e tests"
@@ -78,6 +93,12 @@ def pytest_addoption(parser):
         default=0,
         help="Distribute over N gpu devices when running with pytest-xdist",
     )
+    parser.addoption(
+        "--dump-mlir-files-path",
+        action="store",
+        default=None,
+        help="save mlir files into provided directory, filename based on current test name",
+    )


 def pytest_configure(config):
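With the autouse fixture above, MLIR dumping is controlled in one place by the new pytest option rather than by per-test flags. A minimal usage sketch follows; `pytest.main` and both command-line options are real, but the output directory name is illustrative and creating it up front is an assumption (the fixture does not create it):

import os

import pytest

# Illustrative output directory; for each test, the autouse fixture joins
# this path with "mlir_<test name>.mlir" and points run_utils at it.
dump_dir = "mlir_dumps"
os.makedirs(dump_dir, exist_ok=True)  # assumption: caller creates the directory

# Equivalent to: pytest tests/ --run-e2e --dump-mlir-files-path=mlir_dumps
pytest.main(["tests/", "--run-e2e", f"--dump-mlir-files-path={dump_dir}"])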

tests/kernel/wave/attention/alibi_attention_test.py

Lines changed: 1 addition & 7 deletions

@@ -16,8 +16,6 @@
     set_default_run_config,
 )
 from wave_lang.kernel.wave.utils.torch_utils import (
-    device_arange,
-    device_full,
     device_randn,
     device_zeros,
     to_default_device,
@@ -30,14 +28,11 @@
     AttentionShape,
 )
 from wave_lang.kernel.wave.compile import WaveCompileOptions, wave_compile
-import os
 from torch.testing import assert_close
 from ..common.utils import (
     require_e2e,
-    enable_scheduling_barriers,
 )
-from ..common.shapes import get_test_shapes
-from typing import List, Optional, Tuple
+from typing import Optional, Tuple

 shapes = [(128, 128, 128, 128, 128, 128)]

@@ -144,7 +139,6 @@ def test_alibi_attention(
         subs=hyperparams,
         canonicalize=True,
         run_bench=run_bench,
-        use_scheduling_barriers=enable_scheduling_barriers,
         benchmark_batch_size=10,
         benchmark_repetitions=3,
         benchmark_results_file=perf_filename_tk,
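Every test file in this commit drops the `use_scheduling_barriers=enable_scheduling_barriers` keyword from its `WaveCompileOptions(...)` call, as above. The diff shown here does not include where that default now lives; a conftest fixture in the style of `set_mlir_filename` could supply it. The option name, fixture name, and `run_utils` attribute below are all hypothetical, sketched only to illustrate the centralization pattern:

import pytest

import iree.turbine.kernel.wave.utils.run_utils as run_utils


def pytest_addoption(parser):
    # Hypothetical option; not part of the diff shown above.
    parser.addoption(
        "--enable-scheduling-barriers",
        action="store_true",
        default=False,
        help="insert scheduling barriers into compiled kernels",
    )


@pytest.fixture(scope="function", autouse=True)
def set_scheduling_barriers(request):
    # Mirrors the set_mlir_filename pattern: toggle a module-level flag
    # once, instead of threading use_scheduling_barriers= through every test.
    if request.config.getoption("--enable-scheduling-barriers"):
        run_utils.use_scheduling_barriers = True  # assumed attribute name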

tests/kernel/wave/attention/backward_attention_test.py

Lines changed: 5 additions & 43 deletions

@@ -8,7 +8,6 @@
 import torch
 from torch.nn import functional as F
 import math
-import wave_lang.kernel as tk
 import wave_lang.kernel.lang as tkl
 import wave_lang.kernel.wave as tkw
 from wave_lang.kernel.lang.global_symbols import *
@@ -29,8 +28,6 @@
 from wave_lang.kernel.wave.compile import WaveCompileOptions, wave_compile
 from wave_lang.kernel.wave.constraints import MMAType
 from ..common.utils import (
-    dump_generated_mlir,
-    enable_scheduling_barriers,
     expensive_test,
     require_e2e,
 )
@@ -1142,7 +1139,6 @@ def testAttentionForward(mfma_variant: MMAType, shape: tuple[int, ...]):
     hyperparams.update(get_default_scheduling_params())
     options = WaveCompileOptions(
         subs=hyperparams,
-        use_scheduling_barriers=enable_scheduling_barriers,
         run_bench=False,
         waves_per_eu=2,
         denorm_fp_math_f32="preserve-sign",
@@ -1154,13 +1150,7 @@ def testAttentionForward(mfma_variant: MMAType, shape: tuple[int, ...]):
     lse = device_zeros(batch, q_seq_len, dtype=torch.float16)
     s = device_zeros(batch, q_seq_len, kv_seq_len)

-    asm_fwd = attention_fwd(q, k, v.transpose(-1, -2), s, o, lse)
-
-    if dump_generated_mlir:
-        filename = f"out/wave_attention_fwd_{'x'.join(map(str, shape))}.mlir"
-        with open(filename, "w") as f:
-            f.write(asm_fwd)
-        print(f"IR dumped to {filename}")
+    attention_fwd(q, k, v.transpose(-1, -2), s, o, lse)

     assert_close(s, s_ref, **cmp_params)
     # Can't check P, since we don't actually compute the "real" thing in the
@@ -1209,7 +1199,6 @@ def testAttentionBackward(mfma_variant: MMAType, shape: tuple[int, ...]):
     hyperparams.update(get_default_scheduling_params())
     options = WaveCompileOptions(
         subs=hyperparams,
-        use_scheduling_barriers=enable_scheduling_barriers,
         run_bench=False,
         waves_per_eu=2,
         denorm_fp_math_f32="preserve-sign",
@@ -1229,7 +1218,7 @@ def testAttentionBackward(mfma_variant: MMAType, shape: tuple[int, ...]):
     dp = device_zeros(batch, q_seq_len, kv_seq_len, dtype=torch.float32)
     dp_sub = device_zeros(batch, q_seq_len, kv_seq_len, dtype=torch.float16)

-    asm_bwd = attention_bwd(
+    attention_bwd(
         q,
         k,
         v,
@@ -1247,12 +1236,6 @@ def testAttentionBackward(mfma_variant: MMAType, shape: tuple[int, ...]):
         dp_sub,
     )

-    if dump_generated_mlir:
-        filename = f"out/wave_attention_bwd_{'x'.join(map(str, shape))}.mlir"
-        with open(filename, "w") as f:
-            f.write(asm_bwd)
-        print(f"IR dumped to {filename}")
-
     assert_close(s, s_ref, **cmp_params)
     assert_close(p, p_ref, **cmp_params)

@@ -1305,7 +1288,6 @@ def testAttentionBackward_dv(mfma_variant: MMAType, shape: tuple[int, ...]):
     hyperparams_dv.update(get_default_scheduling_params())
     options = WaveCompileOptions(
         subs=hyperparams_dv,
-        use_scheduling_barriers=enable_scheduling_barriers,
         run_bench=False,
         waves_per_eu=2,
         denorm_fp_math_f32="preserve-sign",
@@ -1317,13 +1299,7 @@ def testAttentionBackward_dv(mfma_variant: MMAType, shape: tuple[int, ...]):
     s = device_zeros(batch, q_seq_len, kv_seq_len, dtype=torch.float32)
     p = device_zeros(batch, q_seq_len, kv_seq_len, dtype=torch.float16)

-    asm_bwd_dv = attention_bwd_dv(q, k, do, lse_ref, dv, s, p)
-
-    if dump_generated_mlir:
-        filename = f"out/wave_attention_bwd_dv_{'x'.join(map(str, shape))}.mlir"
-        with open(filename, "w") as f:
-            f.write(asm_bwd_dv)
-        print(f"IR dumped to {filename}")
+    attention_bwd_dv(q, k, do, lse_ref, dv, s, p)

     assert_close(s, s_ref, **cmp_params)
     assert_close(p, p_ref, **cmp_params)
@@ -1367,7 +1343,6 @@ def testAttentionBackward_dk(mfma_variant: MMAType, shape: tuple[int, ...]):
     hyperparams_dk.update(get_default_scheduling_params())
     options = WaveCompileOptions(
         subs=hyperparams_dk,
-        use_scheduling_barriers=enable_scheduling_barriers,
         run_bench=False,
         waves_per_eu=2,
         denorm_fp_math_f32="preserve-sign",
@@ -1383,7 +1358,7 @@ def testAttentionBackward_dk(mfma_variant: MMAType, shape: tuple[int, ...]):
     dp = torch.zeros_like(s)
     dp_sub = torch.zeros_like(p)

-    asm_bwd_dk = attention_bwd_dk(
+    attention_bwd_dk(
         q,
         k,
         v,
@@ -1398,12 +1373,6 @@ def testAttentionBackward_dk(mfma_variant: MMAType, shape: tuple[int, ...]):
         dp_sub,
     )

-    if dump_generated_mlir:
-        filename = f"out/wave_attention_bwd_dk_{'x'.join(map(str, shape))}.mlir"
-        with open(filename, "w") as f:
-            f.write(asm_bwd_dk)
-        print(f"IR dumped to {filename}")
-
     dp_sub_ref = (dp_ref - D.reshape((batch, q_seq_len, 1))).to(torch.float16)

     assert_close(s, s_ref, **cmp_params)
@@ -1452,7 +1421,6 @@ def testAttentionBackward_dq(mfma_variant: MMAType, shape: tuple[int, ...]):
     hyperparams_dq.update(get_default_scheduling_params())
     options = WaveCompileOptions(
         subs=hyperparams_dq,
-        use_scheduling_barriers=enable_scheduling_barriers,
         run_bench=False,
         waves_per_eu=2,
         denorm_fp_math_f32="preserve-sign",
@@ -1469,7 +1437,7 @@ def testAttentionBackward_dq(mfma_variant: MMAType, shape: tuple[int, ...]):
     dp = torch.zeros_like(s)
     dp_sub = torch.zeros_like(p)

-    asm_bwd_dq = attention_bwd_dq(
+    attention_bwd_dq(
         q,
         k,
         v,
@@ -1485,12 +1453,6 @@ def testAttentionBackward_dq(mfma_variant: MMAType, shape: tuple[int, ...]):
         dp_sub,
     )

-    if dump_generated_mlir:
-        filename = f"out/wave_attention_bwd_dq_{'x'.join(map(str, shape))}.mlir"
-        with open(filename, "w") as f:
-            f.write(asm_bwd_dq)
-        print(f"IR dumped to {filename}")
-
     s_sub_ref = s_ref.to(torch.float16) - lse_ref.reshape((batch, q_seq_len, 1)).expand(
         batch, q_seq_len, kv_seq_len
     )
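Each removed block above repeated the same copy-pasted shape: capture the returned asm, test a module-level flag, and write to a hand-built filename. After this commit, the two `run_utils` attributes set by the conftest fixture are the single point of control. A sketch of what the consuming side might look like; the helper name and call site are assumptions, only the two attribute names appear in the diff:

import iree.turbine.kernel.wave.utils.run_utils as run_utils


def maybe_dump_mlir(asm: str) -> None:
    # Hypothetical helper: the write each test used to perform inline,
    # now driven by the flags that the set_mlir_filename fixture sets.
    if not getattr(run_utils, "dump_generated_mlir", False):
        return
    filename = run_utils.dump_generated_mlir_file
    with open(filename, "w") as f:
        f.write(asm)
    print(f"IR dumped to {filename}")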

tests/kernel/wave/attention/chained_gemm_test.py

Lines changed: 2 additions & 19 deletions

@@ -6,7 +6,6 @@

 import pytest
 import torch
-import wave_lang.kernel as tk
 import wave_lang.kernel.lang as tkl
 import wave_lang.kernel.wave as tkw
 from wave_lang.kernel.lang.global_symbols import *
@@ -27,14 +26,11 @@
 )
 from wave_lang.kernel.wave.compile import WaveCompileOptions, wave_compile
 from wave_lang.kernel.wave.constraints import MMAType
-import os
 from torch.testing import assert_close
 from ..common.utils import (
     require_e2e,
     require_cdna3,
     param_bool,
-    enable_scheduling_barriers,
-    dump_generated_mlir,
 )
 from ..common.shapes import get_test_shapes

@@ -145,7 +141,6 @@ def repeat(
         subs=hyperparams,
         canonicalize=True,
         run_bench=run_bench,
-        use_scheduling_barriers=enable_scheduling_barriers,
         benchmark_batch_size=10,
         benchmark_repetitions=3,
         benchmark_results_file=perf_filename_tk,
@@ -157,13 +152,7 @@ def repeat(
     k = device_randn(batch, kv_seq_len, qk_head_dim, dtype=torch.float16)
     v = device_randn(batch, v_head_dim, kv_seq_len, dtype=torch.float16)
     output = device_zeros(batch, v_head_dim, q_seq_len, dtype=torch.float32)
-    asm = chained_gemm(q, k, v, output)
-
-    if dump_generated_mlir:
-        filename = f"wave_cgemm_{'x'.join(map(str, shape))}.mlir"
-        with open(filename, "w") as f:
-            f.write(asm)
-        print(f"IR dumped to {filename}")
+    chained_gemm(q, k, v, output)

     iree_ref = device_zeros(batch, v_head_dim, q_seq_len, dtype=torch.float32)
     generate_iree_ref("chain_mmt", [q, k, v], [iree_ref], options)
@@ -291,7 +280,6 @@ def repeat(
         subs=hyperparams,
         canonicalize=True,
         run_bench=run_bench,
-        use_scheduling_barriers=enable_scheduling_barriers,
         benchmark_batch_size=10,
         benchmark_repetitions=3,
         benchmark_results_file=perf_filename_tk,
@@ -303,12 +291,7 @@ def repeat(
     k = device_randn(batch, kv_seq_len, qk_head_dim, dtype=torch.float16)
     v = device_randn(batch, v_head_dim, kv_seq_len, dtype=torch.float16)
     output = device_zeros(batch, v_head_dim, q_seq_len, dtype=torch.float32)
-    asm = chained_gemm_f8(q, k, v, output)
-
-    if dump_generated_mlir:
-        filename = f"wave_cgemm_{'x'.join(map(str, shape))}.mlir"
-        with open(filename, "w") as f:
-            f.write(asm)
+    chained_gemm_f8(q, k, v, output)

     iree_ref = device_zeros(batch, v_head_dim, q_seq_len, dtype=torch.float32)
     generate_iree_ref("chain_mmt_f8", [q, k, v], [iree_ref], options)
