
Commit 76c7f03

Merge branch 'main' into tkuczynski/enable_test_small_batch_matmul
2 parents: 421072e + d6b921e

85 files changed: +2566 / -832 lines changed


.github/workflows/integration-tests-amd.yml

Lines changed: 3 additions & 1 deletion
@@ -122,7 +122,9 @@ jobs:
           pytest --capture=tee-sys -rfs -n 8 python/test/gluon/
 
           pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
-          pytest --capture=tee-sys -rfs third_party/amd/python/test/test_extract_slice_concat_op.py
+          pytest --capture=tee-sys -rfs -n 8 third_party/amd/python/test/ \
+            --ignore=third_party/amd/python/test/test_scalarize_packed_fops.py \
+            --ignore=third_party/amd/python/test/test_address_sanitizer.py
           TRITON_ALWAYS_COMPILE=1 pytest --capture=tee-sys -rfs third_party/amd/python/test/test_scalarize_packed_fops.py
           cd python/test/unit
           pytest --capture=tee-sys -rfs -n 12 \

.github/workflows/runner-preparation.yml

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ jobs:
       - name: Detect if build deps (e.g. LLVM hash) changed
         id: detect-change
         if: github.event_name == 'push'
-        uses: tj-actions/changed-files@v46
+        uses: tj-actions/changed-files@v47
         with:
           files: |
             cmake/*.txt

benchmarks/triton_kernels_benchmark/__init__.py

Lines changed: 0 additions & 2 deletions
@@ -3,7 +3,6 @@
 from .benchmark_testing import (
     assert_close,
     do_bench,
-    do_prewarmup,
     filter_providers,
     perf_report,
     Benchmark,
@@ -20,7 +19,6 @@
 __all__ = [
     "assert_close",
     "do_bench",
-    "do_prewarmup",
     "filter_providers",
     "perf_report",
     "Benchmark",

benchmarks/triton_kernels_benchmark/benchmark_testing.py

Lines changed: 24 additions & 22 deletions
@@ -27,7 +27,6 @@
 BENCHMARKING_METHOD = os.getenv("BENCHMARKING_METHOD", "UPSTREAM_PYTORCH_PROFILER")
 BENCHMARKING_CONFIG = {
     "verify": os.getenv("VERIFY", "1") == "1",
-    "do_prewarmup": os.getenv("PREWARMUP", "1") == "1",
 }
 
 
@@ -42,19 +41,6 @@ def synchronize():
         torch.xpu.synchronize()
 
 
-def do_prewarmup(fn, min_seconds=5):
-    """Looks like some functions require pre-warmup with minimum time to do the compilation.
-    It has to be done once."""
-    if not BENCHMARKING_CONFIG["do_prewarmup"]:
-        return
-
-    start = time.time()
-    while time.time() - start < min_seconds:
-        fn()
-        synchronize()
-    BENCHMARKING_CONFIG["do_prewarmup"] = False
-
-
 def _summarize_statistics(times, quantiles, return_mode):
     if quantiles is not None:
         ret = torch.quantile(times, torch.tensor(quantiles, dtype=torch.float)).tolist()
@@ -73,7 +59,7 @@ def _summarize_statistics(times, quantiles, return_mode):
 
 
 def do_bench_elapsed_time(fn, n_warmup=25, n_repeat=100, grad_to_none=None, quantiles=None, return_mode="mean",
-                          device="xpu"):
+                          device="xpu", time_warmup=False):
     """
     Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
     the 20-th and 80-th performance percentile.
@@ -113,16 +99,20 @@ def do_bench_elapsed_time(fn, n_warmup=25, n_repeat=100, grad_to_none=None, quan
     del cache
 
     # compute warmup and repeat times
-    warmup_time = n_warmup * estimate_ms
+    if time_warmup:
+        warmup_ms = n_warmup
+    else:
+        warmup_ms = n_warmup * estimate_ms
     rep_time = n_repeat * estimate_ms
 
-    times = triton_do_bench(fn, warmup=warmup_time, rep=rep_time, grad_to_none=grad_to_none, return_mode="all")
+    times = triton_do_bench(fn, warmup=warmup_ms, rep=rep_time, grad_to_none=grad_to_none, return_mode="all")
     times = torch.tensor(times, dtype=torch.float)
     return _summarize_statistics(times, quantiles, return_mode)
 
 
 def do_bench_upstream_pytorch_profiler(fn, n_warmup=25, n_repeat=100, grad_to_none=None, quantiles=None,
-                                       return_mode="mean", device="xpu", sync_submitting=True, benchmark_label=None):
+                                       return_mode="mean", device="xpu", sync_submitting=True, time_warmup=True,
+                                       benchmark_label=None, max_iters=1500):
     """
     Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
     the 20-th and 80-th performance percentile.
@@ -151,11 +141,23 @@ def do_bench_upstream_pytorch_profiler(fn, n_warmup=25, n_repeat=100, grad_to_no
     cache = torch.empty(int(cache_size // 4), dtype=torch.int, device=device)
 
     # Warm-up
-    for _ in range(n_warmup):
-        fn()
-        # To be consistent with the benchmark measurements
-        if sync_submitting:
+    if time_warmup:
+        # Stop either on max iteration number or max time
+        warmup_time_s = n_warmup / 1000
+        assert sync_submitting
+        start = time.perf_counter()
+        i = 0
+        while i < max_iters and time.perf_counter() - start < warmup_time_s:
+            fn()
             synchronize()
+            i += 1
+        print(f"Stopped warmup after {i} iterations")
+    else:
+        for _ in range(n_warmup):
+            fn()
+            # To be consistent with the benchmark measurements
+            if sync_submitting:
+                synchronize()
 
     # Benchmark
     with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.XPU]) as prof:
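
For reference, this change alters the meaning of n_warmup in do_bench_upstream_pytorch_profiler: with time_warmup=True (the new default) it is a time budget in milliseconds, capped at max_iters iterations, while time_warmup=False keeps the old fixed iteration count. A minimal standalone sketch of that loop, mirroring the diff above (the warmup helper and the dummy workload are illustrative, not part of the patch):

import time

def warmup(fn, synchronize, n_warmup, time_warmup=True, max_iters=1500):
    """Warm up fn either for a time budget (n_warmup in ms) or a fixed iteration count."""
    if time_warmup:
        # Stop on whichever comes first: the iteration cap or the time budget.
        warmup_time_s = n_warmup / 1000
        start = time.perf_counter()
        i = 0
        while i < max_iters and time.perf_counter() - start < warmup_time_s:
            fn()
            synchronize()  # keep submission behaviour consistent with the measured runs
            i += 1
        print(f"Stopped warmup after {i} iterations")
    else:
        for _ in range(n_warmup):
            fn()
            synchronize()

# Example: warm a dummy CPU workload for at most 50 ms (or 1500 iterations).
warmup(lambda: sum(range(10_000)), synchronize=lambda: None, n_warmup=50)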

benchmarks/triton_kernels_benchmark/flash_attention_benchmark.py

Lines changed: 18 additions & 4 deletions
@@ -575,6 +575,15 @@ def get_benchmark(
     # pylint: disable=too-many-branches
     def benchmark(Z, H, N_CTX, D_HEAD, CAUSAL, MODE, provider):
         modes = ['fwd', 'bwd']
+        # This warmup logic improves performance on BMG significantly
+        # For FWD mode in triton & cutlass: Some configs increase performance with warmup as a step function, but some slowly decrease with saturation
+        # Performance is best at 250-400ms range, but we want stable, not just best at ~600ms (triton/cutlass providers)
+        n_warmup_fwd = 600
+        # For BWD mode: Performance doesn't really improve much with warmup for triton, but xetla benefit from more warmup
+        n_warmup_bwd = 400  # Maximum across xetla=400, triton=10, onednn=10
+        n_warmup = n_warmup_fwd if MODE == 'fwd' else n_warmup_bwd
+        # We keep old warmup value, because new warmup makes perfomance on PVC slightly worse
+        n_warmup = 10
         if MODE not in modes:
             raise AssertionError(f'Unknown {MODE}, supported modes are {modes}')
         dtype = torch.float16
@@ -602,9 +611,10 @@ def benchmark(Z, H, N_CTX, D_HEAD, CAUSAL, MODE, provider):
         if provider == 'onednn':
             _, min_ms, max_ms, mean, cv = benchmark_suite.do_bench(
                 torch_fn,
-                n_warmup=10,
+                n_warmup=n_warmup,
                 n_repeat=10,
                 quantiles=quantiles,
+                time_warmup=False,
             )
 
         elif provider == 'triton':
@@ -623,11 +633,13 @@ def benchmark(Z, H, N_CTX, D_HEAD, CAUSAL, MODE, provider):
                 rtol=0,
                 err_msg='triton to torch',
             )
+
             _, min_ms, max_ms, mean, cv = benchmark_suite.do_bench(
                 triton_fn,
-                n_warmup=10,
+                n_warmup=n_warmup,
                 n_repeat=10,
                 quantiles=quantiles,
+                time_warmup=False,
             )
 
         elif provider == 'xetla':
@@ -660,9 +672,10 @@ def xetla_bwd_fn():
 
             _, min_ms, max_ms, mean, cv = benchmark_suite.do_bench(
                 xetla_bwd_fn,
-                n_warmup=10,
+                n_warmup=n_warmup,
                 n_repeat=10,
                 quantiles=quantiles,
+                time_warmup=False,
             )
 
         else:
@@ -685,9 +698,10 @@ def cutlass_fwd_fn():
 
             _, min_ms, max_ms, mean, cv = benchmark_suite.do_bench(
                 cutlass_fwd_fn,
-                n_warmup=10,
+                n_warmup=n_warmup,
                 n_repeat=10,
                 quantiles=quantiles,
+                time_warmup=False,
             )
 
         else:
benchmarks/triton_kernels_benchmark/flex_attention_benchmark_causal_mask.py

Lines changed: 4 additions & 4 deletions
@@ -137,6 +137,8 @@ def causal_mask(_, __, q_idx, kv_idx):
     args={},
 ))
 def benchmark(Z, H_q, H_kv, N_CTX_q, N_CTX_kv, D_HEAD_qk, D_HEAD_v, MODE, provider):
+    # Maximum across torch=200, triton=600
+    n_warmup = 600
     if MODE not in ('fwd', 'bwd'):
         raise ValueError(f"Invalid MODE: {MODE}. Expected 'fwd' or 'bwd'.")
     dtype = torch.float16
@@ -156,7 +158,7 @@ def benchmark(Z, H_q, H_kv, N_CTX_q, N_CTX_kv, D_HEAD_qk, D_HEAD_v, MODE, provid
             mean = float('nan')
             cv = float('nan')
         else:
-            _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(torch_fn, n_warmup=10, n_repeat=10,
+            _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(torch_fn, n_warmup=n_warmup, n_repeat=10,
                                                                   quantiles=quantiles, device=DEVICE)
 
     elif provider == 'triton':
@@ -181,10 +183,8 @@ def benchmark(Z, H_q, H_kv, N_CTX_q, N_CTX_kv, D_HEAD_qk, D_HEAD_v, MODE, provid
         else:
             benchmark_suit.assert_close(triton_fn, torch_fn, atol=1e-2, rtol=1e-3, err_msg='triton to torch')
 
-        # Needs more warmup on B580 for some reason
-        benchmark_suit.do_prewarmup(triton_fn)
         _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(
-            triton_fn, n_warmup=200, n_repeat=10, quantiles=quantiles, device=DEVICE, grad_to_none=(q, k, v),
+            triton_fn, n_warmup=n_warmup, n_repeat=10, quantiles=quantiles, device=DEVICE, grad_to_none=(q, k, v),
             benchmark_label=None if MODE == 'fwd' else 'CompiledFunctionBackward')
 
     else:

benchmarks/triton_kernels_benchmark/flex_attention_benchmark_custom_masks.py

Lines changed: 5 additions & 4 deletions
@@ -82,6 +82,8 @@ def alibi_functional(score, _, h, q_idx, kv_idx):
     args={},
 ))
 def benchmark(Z, H, N_CTX, D_HEAD, MASK, MODE, provider):
+    # There is still performance variance for triton, probably caused by random choice of autotune config
+    n_warmup = 200
     assert MODE in ['fwd', 'bwd']
     assert MASK in ['NATTEN', 'Alibi']
     dtype = torch.float16
@@ -112,9 +114,8 @@ def benchmark(Z, H, N_CTX, D_HEAD, MASK, MODE, provider):
             triton_o = triton_fn()
             triton_do = torch.randn_like(triton_o)
             triton_fn = lambda: triton_o.backward(triton_do, retain_graph=True)
-        # Needs more warmup on B580 for some reason
-        benchmark_suit.do_prewarmup(triton_fn)
-        _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=5, quantiles=quantiles)
+        _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(triton_fn, n_warmup=n_warmup, n_repeat=10,
+                                                              quantiles=quantiles)
         # Values checking cannot be implemented for these case as :
         # "The operator 'aten::_scaled_dot_product_flash_attention_for_cpu' is not currently implemented for the XPU device"
 
@@ -124,7 +125,7 @@ def benchmark(Z, H, N_CTX, D_HEAD, MASK, MODE, provider):
         xformers_o = xformers_fn()
         xformers_do = torch.randn_like(xformers_o)
         xformers_fn = lambda: xformers_o.backward(xformers_do, retain_graph=True)
-        _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(xformers_fn, n_warmup=10, n_repeat=10,
+        _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(xformers_fn, n_warmup=n_warmup, n_repeat=10,
                                                               quantiles=quantiles)
 
     else:

benchmarks/triton_kernels_benchmark/fused_softmax.py

Lines changed: 8 additions & 5 deletions
@@ -128,13 +128,16 @@ def get_benchmark(providers_filter: Optional[list[str]] = None):
         args={"M": 4096},  # values for function arguments not in `x_names` and `y_name`
     ))
     def benchmark(M, N, provider):
+        # Maximum across torch-native=10, triton=800, torch-jit=10, xetla=100, onednn=800
+        # For onednn more warmup very slowly makes performance worse
+        n_warmup = 800
         x = torch.randn(M, N, device="xpu", dtype=torch.bfloat16)
         quantiles = [0.5, 0.0, 1.0]
         if provider == "torch-native":
             _, min_ms, max_ms, mean, cv = benchmark_suite.do_bench(
                 lambda: torch.softmax(x, axis=-1),
                 quantiles=quantiles,
-                n_warmup=10,
+                n_warmup=n_warmup,
                 n_repeat=10,
             )
         if provider == "triton":
@@ -145,13 +148,13 @@ def benchmark(M, N, provider):
             _, min_ms, max_ms, mean, cv = benchmark_suite.do_bench(
                 triton_fn,
                 quantiles=quantiles,
-                n_warmup=10,
+                n_warmup=n_warmup,
                 n_repeat=10,
             )
 
         elif provider == "torch-jit":
             _, min_ms, max_ms, mean, cv = benchmark_suite.do_bench(lambda: naive_softmax(x), quantiles=quantiles,
-                                                                   n_warmup=10, n_repeat=10)
+                                                                   n_warmup=n_warmup, n_repeat=10)
 
         elif provider == "xetla":
             name = f"softmax_shape_{M}_{N}"
@@ -160,7 +163,7 @@ def benchmark(M, N, provider):
             xetla_fn = lambda: func(x, out, 0)
             torch_fn = lambda: torch.softmax(x, axis=-1)
             # benchmark_suite.assert_close(xetla_fn, torch_fn, err_msg="xetla to torch")
-            _, min_ms, max_ms, mean, cv = benchmark_suite.do_bench(xetla_fn, quantiles=quantiles, n_warmup=10,
+            _, min_ms, max_ms, mean, cv = benchmark_suite.do_bench(xetla_fn, quantiles=quantiles, n_warmup=n_warmup,
                                                                    n_repeat=10)
 
         elif provider == "onednn":
@@ -170,7 +173,7 @@ def benchmark(M, N, provider):
             onednn_fn = lambda: func(M, N, x, out, 1)
             torch_fn = lambda: torch.softmax(x, axis=-1)
             benchmark_suite.assert_close(onednn_fn, torch_fn, err_msg="onednn to torch")
-            _, min_ms, max_ms, mean, cv = benchmark_suite.do_bench(onednn_fn, quantiles=quantiles, n_warmup=10,
+            _, min_ms, max_ms, mean, cv = benchmark_suite.do_bench(onednn_fn, quantiles=quantiles, n_warmup=n_warmup,
                                                                    n_repeat=10)
 
         else:

benchmarks/triton_kernels_benchmark/gemm_benchmark.py

Lines changed: 6 additions & 4 deletions
@@ -340,6 +340,8 @@ def get_benchmark(
         args={},
     ))
     def benchmark(B, M, N, K, provider):
+        # Maximum across onednn=600, triton=800, xetla=10, cutlass=600
+        n_warmup = 800
         a_shape, b_shape = get_shapes(B, M, N, K, transpose_a=transpose_a, transpose_b=transpose_b)
 
         torch.manual_seed(0)
@@ -359,7 +361,7 @@ def benchmark(B, M, N, K, provider):
         if provider == 'onednn':
             _, min_ms, max_ms, mean_ms, cv = benchmark_suite.do_bench(
                 lambda: torch.matmul(torch_a, torch_b),
-                n_warmup=10,
+                n_warmup=n_warmup,
                 n_repeat=10,
                 quantiles=quantiles,
             )
@@ -387,7 +389,7 @@ def benchmark(B, M, N, K, provider):
             benchmark_suite.assert_close(triton_fn, torch_fn, atol=1e-4, rtol=rtol, err_msg='triton to torch')
             _, min_ms, max_ms, mean_ms, cv = benchmark_suite.do_bench(
                 triton_fn,
-                n_warmup=10,
+                n_warmup=n_warmup,
                 n_repeat=10,
                 quantiles=quantiles,
             )
@@ -421,7 +423,7 @@ def xetla_func_with_acc_allocation():
             # benchmark_suite.assert_close(xetla_fn, torch_fn, atol=1e-4, rtol=1.0, err_msg='xetla to torch')
             _, min_ms, max_ms, mean_ms, cv = benchmark_suite.do_bench(
                 xetla_fn,
-                n_warmup=10,
+                n_warmup=n_warmup,
                 n_repeat=10,
                 quantiles=quantiles,
             )
@@ -452,7 +454,7 @@ def cutlass_invoker():
             benchmark_suite.assert_close(cutlass_fn, torch_fn, atol=1e-4, rtol=rtol, err_msg='cutlass to torch')
             _, min_ms, max_ms, mean_ms, cv = benchmark_suite.do_bench(
                 cutlass_fn,
-                n_warmup=10,
+                n_warmup=n_warmup,
                 n_repeat=10,
                 quantiles=quantiles,
             )

benchmarks/triton_kernels_benchmark/gemm_postop_addmatrix_benchmark.py

Lines changed: 5 additions & 2 deletions
@@ -315,6 +315,9 @@ def is_enough_memory(x_val):
     args={},
 ))
 def benchmark(B, M, N, K, dtype, provider):
+    # Maximum across onednn=600, triton=1000
+    # For onednn and triton: Some configs increase performance with warmup as a step function, but some slowly decrease with saturation. Performance is best at 150-200ms range, but we want stable, not just best
+    n_warmup = 1000
     res_dtype = torch.float32 if dtype.is_floating_point else torch.int32
     if dtype.is_floating_point:
         rand = lambda shape, dtype: torch.rand(shape, device='xpu', dtype=dtype)
@@ -332,7 +335,7 @@ def benchmark(B, M, N, K, dtype, provider):
     quantiles = [0.5, 0.0, 1.0]
 
     if provider == 'onednn':
-        _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(lambda: torch.matmul(a, b) + d, n_warmup=10,
+        _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(lambda: torch.matmul(a, b) + d, n_warmup=n_warmup,
                                                                  n_repeat=10, quantiles=quantiles)
     elif provider == 'triton':
         assert len(a.shape) == len(b.shape), 'Incompatible sizes'
@@ -353,7 +356,7 @@ def benchmark(B, M, N, K, dtype, provider):
                                       [1, 512, 8192, 32768], [4, 32768, 4096, 128]]:
             # torch int8 matmul on GPU is not supported. only check a few int8 shapes to reduce runtime
             benchmark_suit.assert_close(triton_fn, torch_fn, atol=1e-4, rtol=rtol, err_msg='triton to torch')
-            _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(triton_fn, n_warmup=10, n_repeat=10,
+            _, min_ms, max_ms, mean_ms, cv = benchmark_suit.do_bench(triton_fn, n_warmup=n_warmup, n_repeat=10,
                                                                      quantiles=quantiles)
         else:
             raise NotImplementedError(f'Unsupported provider {provider}')
