intel
diff --git a/‎benchmarks/setup.py‎
Lines changed: 1 addition & 0 deletions b/‎benchmarks/setup.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎benchmarks/tests/test_entry_point.py‎
Lines changed: 6 additions & 2 deletions b/‎benchmarks/tests/test_entry_point.py‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎benchmarks/tests/test_mocks.py‎
Lines changed: 67 additions & 0 deletions b/‎benchmarks/tests/test_mocks.py‎
Lines changed: 67 additions & 0 deletions
diff --git a/‎benchmarks/triton_kernels_benchmark/benchmark_config_templates.py‎
Lines changed: 21 additions & 3 deletions b/‎benchmarks/triton_kernels_benchmark/benchmark_config_templates.py‎
Lines changed: 21 additions & 3 deletions
@@ -116,6 +116,7 @@ def get_git_commit_hash(length=8):
     install_requires=[
         "torch>=2.6",
         "pandas",
+        "scipy",
         "psutil",
         "tabulate",
         "matplotlib",
 
@@ -3,9 +3,11 @@
 import pytest
 
 from triton_kernels_benchmark.benchmark_testing import MarkArgs
+from triton_kernels_benchmark.benchmark_config_templates import CONFIGS
 from triton_kernels_benchmark.benchmark_utils import BenchmarkCategory, BenchmarkConfigs
 
 ALL_CATEGORIES = {cat.value for cat in BenchmarkCategory}
+ALL_CONFIGS = {config.key: config for config in CONFIGS}
 
 
 @pytest.mark.parametrize(
@@ -20,7 +22,7 @@
         "providers_count",
     ),
     (
-        [True, set(), True, ALL_CATEGORIES, [], None, lambda x: x > 1, lambda x: x > 1],
+        [True, ALL_CONFIGS, True, ALL_CATEGORIES, [], None, lambda x: x > 1, lambda x: x > 1],
         [True, {"softmax", "gemm"}, True, ALL_CATEGORIES, [], None, lambda x: x > 1, lambda x: x > 1],
         [True, {"softmax", "gemm"}, True, {"core", "gemm", "softmax"}, [], None, lambda x: x > 1, lambda x: x > 1],
         [False, {"softmax"}, False, {"optional"}, ["triton"], AssertionError, None, None],
@@ -47,6 +49,8 @@ def benchmark_configs():
             select_all=select_all,
             categories_filter=categories_filter,
             providers_filter=providers_filter,
+            json_output=False,
+            detailed_output=False,
             tag="",
         )
 
@@ -57,6 +61,6 @@ def benchmark_configs():
         configs = benchmark_configs().configs
         benchmark_configs().run()
         assert configs_count(len(configs))
-        providers_counts = [len(config.config_summary.selected_providers) for config in configs]
+        providers_counts = [len(config.selected_providers) for config in configs]
         assert providers_count(max(providers_counts))
         assert providers_count(min(providers_counts))
@@ -0,0 +1,67 @@
+from typing import Optional
+
+import re
+
+import io
+
+import pytest
+
+import pandas as pd
+
+from triton_kernels_benchmark.benchmark_utils import BenchmarkCategory, BenchmarkConfigs
+
+ALL_CATEGORIES = {cat.value for cat in BenchmarkCategory}
+
+SOFTMAX_PERFORMANCE_CSV = """
+N,Triton-GB/s,XeTLA-GB/s,Triton-GB/s-min,XeTLA-GB/s-min,Triton-GB/s-max,XeTLA-GB/s-max,Triton-TFlops,XeTLA-TFlops,Triton-TFlops-min,XeTLA-TFlops-min,Triton-TFlops-max,XeTLA-TFlops-max,Triton-CV,XeTLA-CV,datetime,run_counter
+256.000000,473.397771,568.333815,90.083848,514.007860,494.611303,582.542232,0.473398,0.568334,0.090084,0.514008,0.494611,0.582542,0.019154,0.018093,2025-05-05 21:45:29.943213,1
+1024.000000,683.111432,541.549931,672.164101,537.731297,689.852609,548.992673,0.683111,0.541550,0.672164,0.537731,0.689853,0.548993,0.006031,0.004731,2025-05-05 21:45:29.943213,1
+2048.000000,677.320009,726.915809,672.164101,708.497308,683.111380,825.650389,0.677320,0.726916,0.672164,0.708497,0.683111,0.825650,0.003426,0.018620,2025-05-05 21:45:29.943213,1
+4096.000000,627.302921,477.032066,616.809404,474.468764,641.330889,488.846612,0.627303,0.477032,0.616809,0.474469,0.641331,0.488847,0.008189,0.003735,2025-05-05 21:45:29.943213,1
+8192.000000,679.033333,611.916382,665.234595,604.802311,762.600731,637.916958,0.679033,0.611916,0.665235,0.604802,0.762601,0.637917,0.016350,0.008740,2025-05-05 21:45:29.943213,1
+16384.000000,712.219329,677.833087,703.447226,661.562161,760.871449,688.437266,0.712219,0.677833,0.703447,0.661562,0.760871,0.688437,0.009147,0.009317,2025-05-05 21:45:29.943213,1
+32768.000000,733.450281,729.424324,727.861837,726.286411,756.582488,737.136026,0.733450,0.729424,0.727862,0.726286,0.756582,0.737136,0.003869,0.002001,2025-05-05 21:45:29.943213,1
+"""
+
+PERFORMANCE_CSVS = {
+    "softmax": SOFTMAX_PERFORMANCE_CSV,
+}
+
+
+@pytest.mark.parametrize("command", ["run"])
+@pytest.mark.parametrize("benchmark", ["softmax"])
+@pytest.mark.parametrize("provider", ["triton", None])
+@pytest.mark.parametrize("n_runs", [None, 1, 2])
+@pytest.mark.parametrize("show_details", [False, True])
+@pytest.mark.parametrize("json_output", [False, True])
+def test_benchmark_run_monkeypatched(
+    command: str,
+    benchmark: str,
+    provider: Optional[str],
+    n_runs: Optional[int],
+    show_details: bool,
+    json_output: bool,
+    capsys,
+):
+    """Build CLI args, preload each config's res_df from a canned CSV, run, and check the captured stdout."""
+    args = [command, benchmark]
+    if provider:
+        args.extend(["--provider", provider])
+    if n_runs:
+        args.extend(["--n_runs", str(n_runs)])
+    if show_details:
+        args.append("--show-details")
+    if json_output:
+        args.append("--json")
+
+    configs = BenchmarkConfigs.from_args(args)
+    for config in configs.configs:
+        config.res_df = pd.read_csv(io.StringIO(PERFORMANCE_CSVS[config.key]))
+    configs.run()
+
+    captured_output = capsys.readouterr().out
+    if provider and not json_output:
+        assert "Selected providers: {'triton': 'Triton'}" in captured_output.splitlines()
+    # The prettified result table must have a CV column, e.g. "metric     GB/s   GB/s TFlops TFlops     CV    CV"
+    if show_details and not json_output:
+        assert re.search(r"^metric.* CV", captured_output, flags=re.MULTILINE)
@@ -7,6 +7,7 @@
     gemm_tensor_of_ptr_benchmark,
     flash_attention_benchmark,
     flash_attention_tensor_desc_benchmark,
+    prefix_sums,
 )
 
 CONFIGS = [
@@ -29,14 +30,14 @@
         get_benchmark=gemm_tensor_of_ptr_benchmark.get_benchmark,
         run_opts={},
         categories={BenchmarkCategory.EXPERIMENTAL, BenchmarkCategory.GEMM},
-        description="Triton GEMM kernel benchmark - with tensor of pointer",
+        description="GEMM kernel benchmark - with tensor of pointer",
     ),
     BenchmarkConfig(
         key="gemm-tensor-desc",
         get_benchmark=gemm_tensor_desc_benchmark.get_benchmark,
         run_opts={},
         categories={BenchmarkCategory.EXPERIMENTAL, BenchmarkCategory.GEMM},
-        description="Triton GEMM kernel benchmark - with tensor descriptor",
+        description="GEMM kernel benchmark - with tensor descriptor",
     ),
     BenchmarkConfig(
         key="gemm_bt",
@@ -49,19 +50,36 @@
         key="gemm_at",
         get_benchmark=gemm_benchmark.get_benchmark,
         run_opts={"transpose_a": True},
-        categories={BenchmarkCategory.EXPERIMENTAL, BenchmarkCategory.GEMM},
+        categories={BenchmarkCategory.OPTIONAL, BenchmarkCategory.GEMM},
         description="Triton GEMM (A^t@B) kernel benchmark",
     ),
     BenchmarkConfig(
         key="flash_attention",
         get_benchmark=flash_attention_benchmark.get_benchmark,
         run_opts={"fa_kernel_mode": "fwd"},
         categories={BenchmarkCategory.CORE, BenchmarkCategory.FLASH_ATTENTION},
+        description="FlashAttention forward kernel benchmark",
     ),
     BenchmarkConfig(
         key="flash_attention_tensor_desc",
         get_benchmark=flash_attention_tensor_desc_benchmark.get_benchmark,
         run_opts={"fa_kernel_mode": "fwd"},
         categories={BenchmarkCategory.EXPERIMENTAL, BenchmarkCategory.FLASH_ATTENTION},
     ),
+    BenchmarkConfig(
+        key="flash_attention_bwd",
+        get_benchmark=flash_attention_benchmark.get_benchmark,
+        run_opts={"fa_kernel_mode": "bwd"},
+        categories={BenchmarkCategory.OPTIONAL, BenchmarkCategory.FLASH_ATTENTION},
+        description="FlashAttention backward kernel benchmark",
+    ),
+    BenchmarkConfig(
+        key="prefix-sums",
+        get_benchmark=prefix_sums.get_benchmark,
+        run_opts={},
+        categories={BenchmarkCategory.OPTIONAL, BenchmarkCategory.PREFIX_SUMS},
+        description="Prefix Sums kernel benchmark",
+    ),
+    # FIXME: add optional - splitK, streamk, gemm with pre-ops or post-ops, microbenchmarks
+    # FIXME: Experimental - FlexAttention
 ]