Skip to content

Commit c437c95

Browse files
authored
[CI][benchmarks] Longer warmup for flash attention and gemm_preop_exp (#5293)
Closes #5157
1 parent 0d2235f commit c437c95

File tree

2 files changed

+5
-8
lines changed

2 files changed

+5
-8
lines changed

benchmarks/triton_kernels_benchmark/flash_attention_benchmark.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -571,12 +571,11 @@ def benchmark(Z, H, N_CTX, D_HEAD, CAUSAL, MODE, provider):
571571
# This warmup logic improves performance on BMG significantly
572572
# For FWD mode in triton & cutlass: Some configs increase performance with warmup as a step function, but some slowly decrease with saturation
573573
# Performance is best at 250-400ms range, but we want stable, not just best at ~600ms (triton/cutlass providers)
574-
# n_warmup_fwd = 600
574+
n_warmup_fwd = 600
575575
# For BWD mode: Performance doesn't really improve much with warmup for triton, but xetla benefits from more warmup
576-
# n_warmup_bwd = 400 # Maximum across xetla=400, triton=10, onednn=10
577-
# n_warmup = n_warmup_fwd if MODE == 'fwd' else n_warmup_bwd
578-
# We keep old warmup value, because new warmup makes performance on PVC slightly worse
579-
do_bench = benchmark_suite.get_do_bench(n_warmup=10, n_repeat=10, quantiles=[0.5, 0.0, 1.0])
576+
n_warmup_bwd = 400 # Maximum across xetla=400, triton=10, onednn=10
577+
n_warmup = n_warmup_fwd if MODE == 'fwd' else n_warmup_bwd
578+
do_bench = benchmark_suite.get_do_bench(n_warmup=n_warmup, n_repeat=10, quantiles=[0.5, 0.0, 1.0])
580579
if MODE not in modes:
581580
raise AssertionError(f'Unknown {MODE}, supported modes are {modes}')
582581
dtype = torch.float16

benchmarks/triton_kernels_benchmark/gemm_preop_exp_benchmark.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -258,9 +258,7 @@ def benchmark(B, M, N, K, provider):
258258
# Some configs increase performance with warmup as a step function, but some slowly decrease with saturation.
259259
# Performance is best at 200-400ms range, but we want stable, not just best.
260260
# This warmup improves performance on BMG
261-
# n_warmup = 800
262-
# We keep old warmup for now because longer warmup makes performance on PVC worse
263-
do_bench = benchmark_suite.get_do_bench(n_warmup=10, n_repeat=10, quantiles=[0.5, 0.0, 1.0])
261+
do_bench = benchmark_suite.get_do_bench(n_warmup=800, n_repeat=10, quantiles=[0.5, 0.0, 1.0])
264262
if B == 1:
265263
a = torch.rand((M, K), device='xpu', dtype=torch.bfloat16)
266264
b = torch.rand((K, N), device='xpu', dtype=torch.bfloat16)

0 commit comments

Comments
 (0)