Commit 3714e9b

[CI] Better warmup for flex attention on B580 (#4906)
Flex attention requires more warmup steps on B580. This PR:

1. Adds a pre-warmup step for flex attention that is called once per run, so it only runs for the first shape config; experiments show that the first config requires more warmup.
2. Makes GPU synchronization consistent between warmup and benchmarking.
3. Increases the number of warmup iterations.

Should resolve #4852. A better warmup strategy should be worked out after the research in #4911.
1 parent 28f406e commit 3714e9b
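
The prewarmup-once idea described in point 1 boils down to the pattern below (a minimal illustrative sketch, not the commit's code; the names prewarm_once, _already_prewarmed and the synchronize callback are placeholders):

    import time

    _already_prewarmed = False  # module-level flag: only the first shape config pays the cost

    def prewarm_once(fn, synchronize, min_seconds=5):
        """Call fn repeatedly for at least min_seconds so JIT compilation finishes
        before any timed iteration; later calls are no-ops."""
        global _already_prewarmed
        if _already_prewarmed:
            return
        start = time.time()
        while time.time() - start < min_seconds:
            fn()
            synchronize()  # wait for the GPU, matching how the timed runs are measured
        _already_prewarmed = True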

4 files changed: +27 −2 lines changed


benchmarks/triton_kernels_benchmark/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -3,6 +3,7 @@
 from .benchmark_testing import (
     assert_close,
     do_bench,
+    do_prewarmup,
     filter_providers,
     perf_report,
     Benchmark,
@@ -19,6 +20,7 @@
 __all__ = [
     "assert_close",
     "do_bench",
+    "do_prewarmup",
     "filter_providers",
     "perf_report",
     "Benchmark",

benchmarks/triton_kernels_benchmark/benchmark_testing.py

Lines changed: 18 additions & 0 deletions

@@ -27,6 +27,7 @@
 BENCHMARKING_METHOD = os.getenv("BENCHMARKING_METHOD", "UPSTREAM_PYTORCH_PROFILER")
 BENCHMARKING_CONFIG = {
     "verify": os.getenv("VERIFY", "1") == "1",
+    "do_prewarmup": os.getenv("PREWARMUP", "1") == "1",
 }


@@ -41,6 +42,19 @@ def synchronize():
     torch.xpu.synchronize()


+def do_prewarmup(fn, min_seconds=5):
+    """Looks like some functions require pre-warmup with minimum time to do the compilation.
+    It has to be done once."""
+    if not BENCHMARKING_CONFIG["do_prewarmup"]:
+        return
+
+    start = time.time()
+    while time.time() - start < min_seconds:
+        fn()
+        synchronize()
+    BENCHMARKING_CONFIG["do_prewarmup"] = False
+
+
 def _summarize_statistics(times, quantiles, return_mode):
     if quantiles is not None:
         ret = torch.quantile(times, torch.tensor(quantiles, dtype=torch.float)).tolist()
@@ -139,6 +153,10 @@ def do_bench_upstream_pytorch_profiler(fn, n_warmup=25, n_repeat=100, grad_to_no
     # Warm-up
     for _ in range(n_warmup):
         fn()
+        # To be consistent with the benchmark measurements
+        if sync_submitting:
+            synchronize()
+
     # Benchmark
     with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.XPU]) as prof:
         for _ in range(n_repeat):
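
Because BENCHMARKING_CONFIG is built at import time from the PREWARMUP environment variable (default "1"), the extra warmup can be switched off when iterating locally. A small sketch, under the assumption that the package is imported the way the benchmark scripts alias it (benchmark_suit):

    import os

    # Must be set before benchmark_testing is imported: the config dict reads
    # the variable once, at module import time.
    os.environ["PREWARMUP"] = "0"

    import triton_kernels_benchmark as benchmark_suit  # import name assumed from the benchmark scripts

    # With PREWARMUP=0, do_prewarmup() returns immediately and only the regular
    # n_warmup iterations inside do_bench() are executed.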

benchmarks/triton_kernels_benchmark/flex_attention_benchmark_causal_mask.py

Lines changed: 4 additions & 1 deletion

@@ -165,7 +165,10 @@ def benchmark(Z, H_q, H_kv, N_CTX_q, N_CTX_kv, D_HEAD_qk, D_HEAD_v, MODE, provid
             triton_fn = lambda: triton_o.backward(triton_do, retain_graph=True)

         benchmark_suit.assert_close(triton_fn, torch_fn, atol=1e-2, rtol=1e-3, err_msg='triton to torch')
-        _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10, quantiles=quantiles,
+
+        # Needs more warmup on B580 for some reason
+        benchmark_suit.do_prewarmup(triton_fn)
+        _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, n_warmup=200, n_repeat=10, quantiles=quantiles,
                                                               device=DEVICE)

     elif provider == 'onednn':

benchmarks/triton_kernels_benchmark/flex_attention_benchmark_custom_masks.py

Lines changed: 3 additions & 1 deletion

@@ -112,7 +112,9 @@ def benchmark(Z, H, N_CTX, D_HEAD, MASK, MODE, provider):
             triton_o = triton_fn()
             triton_do = torch.randn_like(triton_o)
             triton_fn = lambda: triton_o.backward(triton_do, retain_graph=True)
-        _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, n_warmup=5, n_repeat=5, quantiles=quantiles)
+        # Needs more warmup on B580 for some reason
+        benchmark_suit.do_prewarmup(triton_fn)
+        _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=5, quantiles=quantiles)
         # Values checking cannot be implemented for these case as :
         # "The operator 'aten::_scaled_dot_product_flash_attention_for_cpu' is not currently implemented for the XPU device"

