From 17c92530a564d436b5df131288a8d571f218c463 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20Kami=C5=84ski?= Date: Thu, 18 Sep 2025 11:58:44 +0000 Subject: [PATCH 1/2] Run syclpreview benchmarks once Prevent syclpreview benchmarks scenarios from running twice. --- devops/scripts/benchmarks/benches/compute.py | 81 ++++++++++++++------ 1 file changed, 59 insertions(+), 22 deletions(-) diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py index b8adfe7948c94..b0cb3412c6973 100644 --- a/devops/scripts/benchmarks/benches/compute.py +++ b/devops/scripts/benchmarks/benches/compute.py @@ -24,6 +24,11 @@ class RUNTIMES(Enum): UR = "ur" +class PROFILERS(Enum): + TIMER = "timer" + CPU_COUNTER = "cpuCounter" + + def runtime_to_name(runtime: RUNTIMES) -> str: return { RUNTIMES.SYCL_PREVIEW: "SYCL Preview", @@ -171,27 +176,35 @@ def benchmarks(self) -> list[Benchmark]: # hand-picked value so that total execution time of the benchmark is # similar on all architectures - long_lernel_exec_time_ioq = [20] + long_kernel_exec_time_ioq = [20] # For BMG server, a new value 200 is used, but we have to create metadata # for both values to keep the dashboard consistent. # See SubmitKernel.enabled() long_kernel_exec_time_ooo = [20, 200] - # The Combo Profiler is available only for selected sycl benchmarks - profiler_types = ["timer", "cpuCounter"] - for runtime in list(RUNTIMES): # Add SubmitKernel benchmarks using loops for in_order_queue in [0, 1]: for measure_completion in [0, 1]: for use_events in [0, 1]: long_kernel_exec_time = ( - long_lernel_exec_time_ioq + long_kernel_exec_time_ioq if in_order_queue else long_kernel_exec_time_ooo ) for kernel_exec_time in [1, *long_kernel_exec_time]: - for profiler_type in profiler_types: + benches.append( + SubmitKernel( + self, + runtime, + in_order_queue, + measure_completion, + use_events, + kernel_exec_time, + ) + ) + if runtime == RUNTIMES.SYCL: + # Create CPU count variant benches.append( SubmitKernel( self, @@ -200,7 +213,7 @@ def benchmarks(self) -> list[Benchmark]: measure_completion, use_events, kernel_exec_time, - profiler_type, + profiler_type=PROFILERS.CPU_COUNTER, ) ) @@ -212,8 +225,13 @@ def benchmarks(self) -> list[Benchmark]: ) # Add ULLS benchmarks - for profiler_type in profiler_types: - benches.append(UllsEmptyKernel(self, runtime, 1000, 256, profiler_type)) + if runtime == RUNTIMES.SYCL: + benches.append( + UllsEmptyKernel( + self, runtime, 1000, 256, profiler_type=PROFILERS.CPU_COUNTER + ) + ) + benches.append(UllsEmptyKernel(self, runtime, 1000, 256)) benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1)) # Add GraphApiSubmitGraph benchmarks @@ -221,7 +239,19 @@ def benchmarks(self) -> list[Benchmark]: for num_kernels in self.submit_graph_num_kernels: for measure_completion_time in [0, 1]: for use_events in [0, 1]: - for profiler_type in profiler_types: + benches.append( + GraphApiSubmitGraph( + self, + runtime, + in_order_queue, + num_kernels, + measure_completion_time, + use_events, + useHostTasks=0, + ) + ) + if runtime == RUNTIMES.SYCL: + # Create CPU count variant benches.append( GraphApiSubmitGraph( self, @@ -229,9 +259,9 @@ def benchmarks(self) -> list[Benchmark]: in_order_queue, num_kernels, measure_completion_time, - profiler_type, use_events, useHostTasks=0, + profiler_type=PROFILERS.CPU_COUNTER, ) ) @@ -244,7 +274,7 @@ def benchmarks(self) -> list[Benchmark]: GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 0, "Llama"), GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 1, "Llama"), ] - 
for profiler_type in profiler_types: + for profiler_type in list(PROFILERS): benches.append( QueueInOrderMemcpy(self, 0, "Device", "Device", 1024, profiler_type) ) @@ -310,7 +340,12 @@ def parse_unit_type(compute_unit): class ComputeBenchmark(Benchmark): def __init__( - self, bench, name, test, runtime: RUNTIMES = None, profiler_type: str = "" + self, + bench, + name, + test, + runtime: RUNTIMES = None, + profiler_type: PROFILERS = PROFILERS.TIMER, ): super().__init__(bench.directory, bench) self.bench = bench @@ -478,7 +513,7 @@ def __init__( MeasureCompletion=0, UseEvents=0, KernelExecTime=1, - profiler_type="", + profiler_type=PROFILERS.TIMER, ): self.ioq = ioq self.MeasureCompletion = MeasureCompletion @@ -578,7 +613,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]: f"--UseEvents={self.UseEvents}", ] if self.runtime == RUNTIMES.SYCL: - bin_args.append(f"--profilerType={self.profiler_type}") + bin_args.append(f"--profilerType={self.profiler_type.value}") return bin_args def get_metadata(self) -> dict[str, BenchmarkMetadata]: @@ -647,7 +682,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]: f"--dst={self.destination}", f"--size={self.size}", "--withCopyOffload=0", - f"--profilerType={self.profiler_type}", + f"--profilerType={self.profiler_type.value}", ] @@ -693,7 +728,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]: f"--size={self.size}", "--count=100", "--withCopyOffload=0", - f"--profilerType={self.profiler_type}", + f"--profilerType={self.profiler_type.value}", ] @@ -731,7 +766,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]: f"--sourcePlacement={self.source}", f"--destinationPlacement={self.destination}", f"--size={self.size}", - f"--profilerType={self.profiler_type}", + f"--profilerType={self.profiler_type.value}", ] @@ -970,9 +1005,9 @@ def __init__( inOrderQueue, numKernels, measureCompletionTime, - profiler_type, useEvents, useHostTasks, + profiler_type=PROFILERS.TIMER, ): self.inOrderQueue = inOrderQueue self.numKernels = numKernels @@ -1037,12 +1072,14 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]: f"--UseHostTasks={self.useHostTasks}", ] if self.runtime == RUNTIMES.SYCL: - bin_args.append(f"--profilerType={self.profiler_type}") + bin_args.append(f"--profilerType={self.profiler_type.value}") return bin_args class UllsEmptyKernel(ComputeBenchmark): - def __init__(self, bench, runtime: RUNTIMES, wgc, wgs, profiler_type): + def __init__( + self, bench, runtime: RUNTIMES, wgc, wgs, profiler_type=PROFILERS.TIMER + ): self.wgc = wgc self.wgs = wgs # iterations per bin_args: --iterations=10000 @@ -1084,7 +1121,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]: f"--wgc={self.wgc}", ] if self.runtime == RUNTIMES.SYCL: - bin_args.append(f"--profilerType={self.profiler_type}") + bin_args.append(f"--profilerType={self.profiler_type.value}") return bin_args From dfb4e996c470a7029a3fe7c76a83db2e75297e9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20Kami=C5=84ski?= Date: Thu, 18 Sep 2025 13:42:27 +0000 Subject: [PATCH 2/2] Refactor nested loops in ComputeBench benches instantiation --- devops/scripts/benchmarks/benches/compute.py | 161 +++++++++++-------- 1 file changed, 92 insertions(+), 69 deletions(-) diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py index b0cb3412c6973..dcb23e392dc35 100644 --- a/devops/scripts/benchmarks/benches/compute.py +++ 
b/devops/scripts/benchmarks/benches/compute.py @@ -3,6 +3,7 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +from itertools import product import os import csv import io @@ -182,49 +183,61 @@ def benchmarks(self) -> list[Benchmark]: # See SubmitKernel.enabled() long_kernel_exec_time_ooo = [20, 200] - for runtime in list(RUNTIMES): - # Add SubmitKernel benchmarks using loops - for in_order_queue in [0, 1]: - for measure_completion in [0, 1]: - for use_events in [0, 1]: - long_kernel_exec_time = ( - long_kernel_exec_time_ioq - if in_order_queue - else long_kernel_exec_time_ooo - ) - for kernel_exec_time in [1, *long_kernel_exec_time]: - benches.append( - SubmitKernel( - self, - runtime, - in_order_queue, - measure_completion, - use_events, - kernel_exec_time, - ) - ) - if runtime == RUNTIMES.SYCL: - # Create CPU count variant - benches.append( - SubmitKernel( - self, - runtime, - in_order_queue, - measure_completion, - use_events, - kernel_exec_time, - profiler_type=PROFILERS.CPU_COUNTER, - ) - ) - - # Add SinKernelGraph benchmarks - for with_graphs in [0, 1]: - for num_kernels in [5, 100]: + submit_kernel_params = product( + list(RUNTIMES), + [0, 1], # in_order_queue + [0, 1], # measure_completion + [0, 1], # use_events + ) + for ( + runtime, + in_order_queue, + measure_completion, + use_events, + ) in submit_kernel_params: + long_kernel_exec_time = ( + long_kernel_exec_time_ioq + if in_order_queue + else long_kernel_exec_time_ooo + ) + for kernel_exec_time in [1, *long_kernel_exec_time]: + benches.append( + SubmitKernel( + self, + runtime, + in_order_queue, + measure_completion, + use_events, + kernel_exec_time, + ) + ) + if runtime == RUNTIMES.SYCL: + # Create CPU count variant benches.append( - GraphApiSinKernelGraph(self, runtime, with_graphs, num_kernels) + SubmitKernel( + self, + runtime, + in_order_queue, + measure_completion, + use_events, + kernel_exec_time, + profiler_type=PROFILERS.CPU_COUNTER, + ) ) + # Add SinKernelGraph benchmarks + sin_kernel_graph_params = product( + list(RUNTIMES), + [0, 1], # with_graphs + [5, 100], # num_kernels + ) + for runtime, with_graphs, num_kernels in sin_kernel_graph_params: + benches.append( + GraphApiSinKernelGraph(self, runtime, with_graphs, num_kernels) + ) + # Add ULLS benchmarks + for runtime in list(RUNTIMES): if runtime == RUNTIMES.SYCL: benches.append( UllsEmptyKernel( @@ -234,36 +247,46 @@ def benchmarks(self) -> list[Benchmark]: benches.append(UllsEmptyKernel(self, runtime, 1000, 256)) benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1)) - # Add GraphApiSubmitGraph benchmarks - for in_order_queue in [0, 1]: - for num_kernels in self.submit_graph_num_kernels: - for measure_completion_time in [0, 1]: - for use_events in [0, 1]: - benches.append( - GraphApiSubmitGraph( - self, - runtime, - in_order_queue, - num_kernels, - measure_completion_time, - use_events, - useHostTasks=0, - ) - ) - if runtime == RUNTIMES.SYCL: - # Create CPU count variant - benches.append( - GraphApiSubmitGraph( - self, - runtime, - in_order_queue, - num_kernels, - measure_completion_time, - use_events, - useHostTasks=0, - profiler_type=PROFILERS.CPU_COUNTER, - ) - ) + # Add GraphApiSubmitGraph benchmarks + submit_graph_params = product( + list(RUNTIMES), + [0, 1], # in_order_queue + self.submit_graph_num_kernels, + [0, 1], # measure_completion_time + [0, 1], # use_events + ) + for ( + runtime, + in_order_queue, + num_kernels, + measure_completion_time, + use_events, + ) in submit_graph_params: + benches.append( + 
GraphApiSubmitGraph( + self, + runtime, + in_order_queue, + num_kernels, + measure_completion_time, + use_events, + useHostTasks=0, + ) + ) + if runtime == RUNTIMES.SYCL: + # Create CPU count variant + benches.append( + GraphApiSubmitGraph( + self, + runtime, + in_order_queue, + num_kernels, + measure_completion_time, + use_events, + useHostTasks=0, + profiler_type=PROFILERS.CPU_COUNTER, + ) + ) # Add other benchmarks benches += [
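
Note on the PATCH 2/2 refactor: the nested parameter loops are flattened with itertools.product, which yields every combination of the parameter lists as flat tuples. Below is a minimal standalone sketch of that pattern; the Runtime enum values and parameter names are illustrative stand-ins, not copied from compute.py.

from enum import Enum
from itertools import product


class Runtime(Enum):
    # Hypothetical runtime identifiers, standing in for the RUNTIMES enum.
    SYCL = "sycl"
    LEVEL_ZERO = "l0"
    UR = "ur"


# Equivalent to three nested for-loops over runtime, in_order_queue and
# use_events, but without the extra indentation levels.
params = product(
    list(Runtime),
    [0, 1],  # in_order_queue
    [0, 1],  # use_events
)

for runtime, in_order_queue, use_events in params:
    # In compute.py each combination constructs a benchmark object;
    # here we just print it to show the iteration order.
    print(runtime.value, in_order_queue, use_events)

product() iterates with the rightmost argument varying fastest, which matches the nesting order of the original loops, so each benchmark family still sees its parameter combinations in the same order as before the refactor.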