Skip to content

Commit c437c95

Browse files
authored
[CI][benchmarks] Longer warmup for flash attention and gemm_preop_exp (#5293)
Closes #5157
1 parent 0d2235f commit c437c95

File tree

2 files changed

+5
-8
lines changed

2 files changed

+5
-8
lines changed

benchmarks/triton_kernels_benchmark/flash_attention_benchmark.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -571,12 +571,11 @@ def benchmark(Z, H, N_CTX, D_HEAD, CAUSAL, MODE, provider):
571571
# This warmup logic improves performance on BMG significantly
572572
# For FWD mode in triton & cutlass: Some configs increase performance with warmup as a step function, but some slowly decrease with saturation
573573
# Performance is best at 250-400ms range, but we want stable, not just best at ~600ms (triton/cutlass providers)
574-
# n_warmup_fwd = 600
574+
n_warmup_fwd = 600
575575
# For BWD mode: Performance doesn't really improve much with warmup for triton, but xetla benefits from more warmup
576-
# n_warmup_bwd = 400 # Maximum across xetla=400, triton=10, onednn=10
577-
# n_warmup = n_warmup_fwd if MODE == 'fwd' else n_warmup_bwd
578-
# We keep old warmup value, because new warmup makes performance on PVC slightly worse
579-
do_bench = benchmark_suite.get_do_bench(n_warmup=10, n_repeat=10, quantiles=[0.5, 0.0, 1.0])
576+
n_warmup_bwd = 400 # Maximum across xetla=400, triton=10, onednn=10
577+
n_warmup = n_warmup_fwd if MODE == 'fwd' else n_warmup_bwd
578+
do_bench = benchmark_suite.get_do_bench(n_warmup=n_warmup, n_repeat=10, quantiles=[0.5, 0.0, 1.0])
580579
if MODE not in modes:
581580
raise AssertionError(f'Unknown {MODE}, supported modes are {modes}')
582581
dtype = torch.float16

benchmarks/triton_kernels_benchmark/gemm_preop_exp_benchmark.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -258,9 +258,7 @@ def benchmark(B, M, N, K, provider):
258258
# Some configs increase performance with warmup as a step function, but some slowly decrease with saturation.
259259
# Performance is best at 200-400ms range, but we want stable, not just best.
260260
# This warmup improves performance on BMG
261-
# n_warmup = 800
262-
# We keep old warmup for now because longer warmup makes performance on PVC worse
263-
do_bench = benchmark_suite.get_do_bench(n_warmup=10, n_repeat=10, quantiles=[0.5, 0.0, 1.0])
261+
do_bench = benchmark_suite.get_do_bench(n_warmup=800, n_repeat=10, quantiles=[0.5, 0.0, 1.0])
264262
if B == 1:
265263
a = torch.rand((M, K), device='xpu', dtype=torch.bfloat16)
266264
b = torch.rand((K, N), device='xpu', dtype=torch.bfloat16)

0 commit comments

Comments
 (0)