Skip to content

Commit 93a515d

Browse files
authored
[Benchmarks] Use combo profiler in UR SubmitKernel scenarios (#20295)
Use the Combo Profiler in all supported Compute Benchmark scenarios. ~~Introduce the `--profiler-type` parameter for picking one profiler each run. This effectively disables running the same scenarios two times: with time measurement and with CPU instructions retired counter.~~ ~~Leaving by default `CPU counter` scenarios enabled in CI. This will result in fewer scenarios being run, as we won't run time-measuring scenarios where the CPU counter is available. In the dashboard, only historic results will be drawn, no new runs in such cases.~~ The `CPU count` group and name suffixes can now be safely determined beforehand, without having to parse run logs. Also, a Compute Benchmarks version bump is made for these changes.
1 parent 1f89d58 commit 93a515d

File tree

1 file changed

+51
-84
lines changed

1 file changed

+51
-84
lines changed

devops/scripts/benchmarks/benches/compute.py

Lines changed: 51 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,8 @@ def git_url(self) -> str:
6161
return "https://github.com/intel/compute-benchmarks.git"
6262

6363
def git_hash(self) -> str:
64-
# Sep 25, 2025
65-
return "7ba2e629404e34c635a46f28550a0952717d120f"
64+
# Oct 9, 2025
65+
return "32805b4b6f8dafb4a97f21c4c85bb2f6963f8dbb"
6666

6767
def setup(self) -> None:
6868
if options.sycl is None:
@@ -152,7 +152,7 @@ def benchmarks(self) -> list[Benchmark]:
152152
kernel_exec_time,
153153
)
154154
)
155-
if runtime == RUNTIMES.SYCL:
155+
if runtime in (RUNTIMES.SYCL, RUNTIMES.SYCL_PREVIEW, RUNTIMES.UR):
156156
# Create CPU count variant
157157
benches.append(
158158
SubmitKernel(
@@ -203,6 +203,11 @@ def benchmarks(self) -> list[Benchmark]:
203203
measure_completion_time,
204204
use_events,
205205
) in submit_graph_params:
206+
# Non-sycl runtimes have to be run with emulated graphs,
207+
# see: https://github.com/intel/compute-benchmarks/commit/d81d5d602739482b9070c872a28c0b5ebb41de70
208+
emulate_graphs = (
209+
0 if runtime in (RUNTIMES.SYCL, RUNTIMES.SYCL_PREVIEW) else 1
210+
)
206211
benches.append(
207212
GraphApiSubmitGraph(
208213
self,
@@ -211,6 +216,7 @@ def benchmarks(self) -> list[Benchmark]:
211216
num_kernels,
212217
measure_completion_time,
213218
use_events,
219+
emulate_graphs,
214220
useHostTasks=0,
215221
)
216222
)
@@ -224,6 +230,7 @@ def benchmarks(self) -> list[Benchmark]:
224230
num_kernels,
225231
measure_completion_time,
226232
use_events,
233+
emulate_graphs,
227234
useHostTasks=0,
228235
profiler_type=PROFILERS.CPU_COUNTER,
229236
)
@@ -294,14 +301,6 @@ def benchmarks(self) -> list[Benchmark]:
294301
return benches
295302

296303

297-
def parse_unit_type(compute_unit):
298-
if "[count]" in compute_unit:
299-
return "instr"
300-
elif "[us]" in compute_unit:
301-
return "μs"
302-
return compute_unit.replace("[", "").replace("]", "")
303-
304-
305304
class ComputeBenchmark(Benchmark):
306305
def __init__(
307306
self,
@@ -330,6 +329,17 @@ def benchmark_bin(self) -> Path:
330329
"""Returns the path to the benchmark binary"""
331330
return self.bench.project.build_dir / "bin" / self.bench_name
332331

332+
def cpu_count_str(self, separator: str = "") -> str:
333+
# Note: SYCL CI currently relies on this "CPU count" value.
334+
# Please update /devops/scripts/benchmarks/compare.py if this value
335+
# is changed. See compare.py usage (w.r.t. --regression-filter) in
336+
# /devops/actions/run-tests/benchmarks/action.yml.
337+
return (
338+
f"{separator} CPU count"
339+
if self.profiler_type == PROFILERS.CPU_COUNTER
340+
else ""
341+
)
342+
333343
def get_iters(self, run_trace: TracingType):
334344
"""Returns the number of iterations to run for the given tracing type."""
335345
return (
@@ -412,27 +422,23 @@ def run(
412422
)
413423
parsed_results = self.parse_output(result)
414424
ret = []
415-
for label, median, stddev, unit in parsed_results:
416-
extra_label = " CPU count" if parse_unit_type(unit) == "instr" else ""
417-
# Note: SYCL CI currently relies on this "CPU count" value.
418-
# Please update /devops/scripts/benchmarks/compare.py if this value
419-
# is changed. See compare.py usage (w.r.t. --regression-filter) in
420-
# /devops/actions/run-tests/benchmarks/action.yml.
425+
for median, stddev in parsed_results:
426+
unit = "instr" if self.profiler_type == PROFILERS.CPU_COUNTER else "μs"
421427
ret.append(
422428
Result(
423-
label=self.name() + extra_label,
429+
label=self.name(),
424430
value=median,
425431
stddev=stddev,
426432
command=command,
427433
env=env_vars,
428-
unit=parse_unit_type(unit),
434+
unit=unit,
429435
git_url=self.bench.git_url(),
430436
git_hash=self.bench.git_hash(),
431437
)
432438
)
433439
return ret
434440

435-
def parse_output(self, output):
441+
def parse_output(self, output: str) -> list[tuple[float, float]]:
436442
csv_file = io.StringIO(output)
437443
reader = csv.reader(csv_file)
438444
next(reader, None)
@@ -442,16 +448,14 @@ def parse_output(self, output):
442448
if data_row is None:
443449
break
444450
try:
445-
label = data_row[0]
446451
mean = float(data_row[1])
447452
median = float(data_row[2])
448453
# compute benchmarks report stddev as %
449454
stddev = mean * (float(data_row[3].strip("%")) / 100.0)
450455
if not math.isfinite(stddev):
451456
stddev = 0.0 # Default to 0.0 if stddev is invalid
452457

453-
unit = data_row[7]
454-
results.append((label, median, stddev, unit))
458+
results.append((median, stddev))
455459
except (ValueError, IndexError) as e:
456460
raise ValueError(f"Error parsing output: {e}")
457461
if len(results) == 0:
@@ -532,7 +536,7 @@ def name(self):
532536
f" KernelExecTime={self.KernelExecTime}" if self.KernelExecTime != 1 else ""
533537
)
534538

535-
return f"api_overhead_benchmark_{self.runtime.value} SubmitKernel {order}{completion_str}{events_str}{kernel_exec_time_str}"
539+
return f"api_overhead_benchmark_{self.runtime.value} SubmitKernel {order}{completion_str}{events_str}{kernel_exec_time_str}{self.cpu_count_str()}"
536540

537541
def display_name(self) -> str:
538542
order = "in order" if self.ioq else "out of order"
@@ -544,7 +548,7 @@ def display_name(self) -> str:
544548
if self.KernelExecTime != 1:
545549
info.append(f"KernelExecTime={self.KernelExecTime}")
546550
additional_info = f" {' '.join(info)}" if info else ""
547-
return f"{self.runtime.value.upper()} SubmitKernel {order}{additional_info}, NumKernels {self.NumKernels}"
551+
return f"{self.runtime.value.upper()} SubmitKernel {order}{additional_info}, NumKernels {self.NumKernels}{self.cpu_count_str(separator=',')}"
548552

549553
def explicit_group(self):
550554
order = "in order" if self.ioq else "out of order"
@@ -553,7 +557,7 @@ def explicit_group(self):
553557

554558
kernel_exec_time_str = f" long kernel" if self.KernelExecTime != 1 else ""
555559

556-
return f"SubmitKernel {order}{completion_str}{events_str}{kernel_exec_time_str}"
560+
return f"SubmitKernel {order}{completion_str}{events_str}{kernel_exec_time_str}{self.cpu_count_str(separator=',')}"
557561

558562
def description(self) -> str:
559563
order = "in-order" if self.ioq else "out-of-order"
@@ -571,34 +575,16 @@ def range(self) -> tuple[float, float]:
571575

572576
def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
573577
iters = self.get_iters(run_trace)
574-
bin_args = [
578+
return [
575579
f"--iterations={iters}",
576580
f"--Ioq={self.ioq}",
577581
f"--MeasureCompletion={self.MeasureCompletion}",
578582
"--Profiling=0",
579583
f"--NumKernels={self.NumKernels}",
580584
f"--KernelExecTime={self.KernelExecTime}",
581585
f"--UseEvents={self.UseEvents}",
586+
f"--profilerType={self.profiler_type.value}",
582587
]
583-
if self.runtime == RUNTIMES.SYCL:
584-
bin_args.append(f"--profilerType={self.profiler_type.value}")
585-
return bin_args
586-
587-
def get_metadata(self) -> dict[str, BenchmarkMetadata]:
588-
metadata_dict = super().get_metadata()
589-
590-
# Create CPU count variant with modified display name and explicit_group
591-
cpu_count_name = self.name() + " CPU count"
592-
cpu_count_metadata = copy.deepcopy(metadata_dict[self.name()])
593-
cpu_count_display_name = self.display_name() + ", CPU count"
594-
cpu_count_explicit_group = (
595-
self.explicit_group() + ", CPU count" if self.explicit_group() else ""
596-
)
597-
cpu_count_metadata.display_name = cpu_count_display_name
598-
cpu_count_metadata.explicit_group = cpu_count_explicit_group
599-
metadata_dict[cpu_count_name] = cpu_count_metadata
600-
601-
return metadata_dict
602588

603589

604590
class ExecImmediateCopyQueue(ComputeBenchmark):
@@ -622,11 +608,11 @@ def __init__(
622608

623609
def name(self):
624610
order = "in order" if self.ioq else "out of order"
625-
return f"api_overhead_benchmark_sycl ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}"
611+
return f"api_overhead_benchmark_sycl ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str()}"
626612

627613
def display_name(self) -> str:
628614
order = "in order" if self.ioq else "out of order"
629-
return f"SYCL ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}"
615+
return f"SYCL ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str(separator=',')}"
630616

631617
def description(self) -> str:
632618
order = "in-order" if self.ioq else "out-of-order"
@@ -671,10 +657,10 @@ def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type):
671657
)
672658

673659
def name(self):
674-
return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}"
660+
return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str()}"
675661

676662
def display_name(self) -> str:
677-
return f"SYCL QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}"
663+
return f"SYCL QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str(separator=',')}"
678664

679665
def description(self) -> str:
680666
operation = "copy-only" if self.isCopyOnly else "copy and command submission"
@@ -713,10 +699,10 @@ def __init__(self, bench, source, destination, size, profiler_type):
713699
)
714700

715701
def name(self):
716-
return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}"
702+
return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str()}"
717703

718704
def display_name(self) -> str:
719-
return f"SYCL QueueMemcpy from {self.source} to {self.destination}, size {self.size}"
705+
return f"SYCL QueueMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str(separator=',')}"
720706

721707
def description(self) -> str:
722708
return (
@@ -974,6 +960,7 @@ def __init__(
974960
numKernels,
975961
measureCompletionTime,
976962
useEvents,
963+
emulate_graphs,
977964
useHostTasks,
978965
profiler_type=PROFILERS.TIMER,
979966
):
@@ -982,6 +969,7 @@ def __init__(
982969
self.measureCompletionTime = measureCompletionTime
983970
self.useEvents = useEvents
984971
self.useHostTasks = useHostTasks
972+
self.emulateGraphs = emulate_graphs
985973
self.ioq_str = "in order" if self.inOrderQueue else "out of order"
986974
self.measure_str = (
987975
" with measure completion" if self.measureCompletionTime else ""
@@ -1003,7 +991,7 @@ def supported_runtimes(self) -> list[RUNTIMES]:
1003991
return super().supported_runtimes() + [RUNTIMES.SYCL_PREVIEW]
1004992

1005993
def explicit_group(self):
1006-
return f"SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels"
994+
return f"SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels{self.cpu_count_str(separator=',')}"
1007995

1008996
def description(self) -> str:
1009997
return (
@@ -1012,10 +1000,10 @@ def description(self) -> str:
10121000
)
10131001

10141002
def name(self):
1015-
return f"graph_api_benchmark_{self.runtime.value} SubmitGraph{self.use_events_str}{self.host_tasks_str} numKernels:{self.numKernels} ioq {self.inOrderQueue} measureCompletion {self.measureCompletionTime}"
1003+
return f"graph_api_benchmark_{self.runtime.value} SubmitGraph{self.use_events_str}{self.host_tasks_str} numKernels:{self.numKernels} ioq {self.inOrderQueue} measureCompletion {self.measureCompletionTime}{self.cpu_count_str()}"
10161004

10171005
def display_name(self) -> str:
1018-
return f"{self.runtime.value.upper()} SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels"
1006+
return f"{self.runtime.value.upper()} SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels{self.cpu_count_str(separator=',')}"
10191007

10201008
def get_tags(self):
10211009
return [
@@ -1028,7 +1016,7 @@ def get_tags(self):
10281016

10291017
def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
10301018
iters = self.get_iters(run_trace)
1031-
bin_args = [
1019+
return [
10321020
f"--iterations={iters}",
10331021
f"--NumKernels={self.numKernels}",
10341022
f"--MeasureCompletionTime={self.measureCompletionTime}",
@@ -1038,26 +1026,9 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
10381026
f"--UseEvents={self.useEvents}",
10391027
"--UseExplicit=0",
10401028
f"--UseHostTasks={self.useHostTasks}",
1029+
f"--profilerType={self.profiler_type.value}",
1030+
f"--EmulateGraphs={self.emulateGraphs}",
10411031
]
1042-
if self.runtime == RUNTIMES.SYCL:
1043-
bin_args.append(f"--profilerType={self.profiler_type.value}")
1044-
return bin_args
1045-
1046-
def get_metadata(self) -> dict[str, BenchmarkMetadata]:
1047-
metadata_dict = super().get_metadata()
1048-
1049-
# Create CPU count variant with modified display name and explicit_group
1050-
cpu_count_name = self.name() + " CPU count"
1051-
cpu_count_metadata = copy.deepcopy(metadata_dict[self.name()])
1052-
cpu_count_display_name = self.display_name() + ", CPU count"
1053-
cpu_count_explicit_group = (
1054-
self.explicit_group() + ", CPU count" if self.explicit_group() else ""
1055-
)
1056-
cpu_count_metadata.display_name = cpu_count_display_name
1057-
cpu_count_metadata.explicit_group = cpu_count_explicit_group
1058-
metadata_dict[cpu_count_name] = cpu_count_metadata
1059-
1060-
return metadata_dict
10611032

10621033

10631034
class UllsEmptyKernel(ComputeBenchmark):
@@ -1081,32 +1052,28 @@ def supported_runtimes(self) -> list[RUNTIMES]:
10811052
return [RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]
10821053

10831054
def explicit_group(self):
1084-
return f"EmptyKernel, wgc: {self.wgc}, wgs: {self.wgs}"
1055+
return f"EmptyKernel, wgc: {self.wgc}, wgs: {self.wgs}{self.cpu_count_str(separator=',')}"
10851056

10861057
def description(self) -> str:
10871058
return ""
10881059

10891060
def name(self):
1090-
return f"ulls_benchmark_{self.runtime.value} EmptyKernel wgc:{self.wgc}, wgs:{self.wgs}"
1061+
return f"ulls_benchmark_{self.runtime.value} EmptyKernel wgc:{self.wgc}, wgs:{self.wgs}{self.cpu_count_str()}"
10911062

10921063
def display_name(self) -> str:
1093-
return (
1094-
f"{self.runtime.value.upper()} EmptyKernel, wgc {self.wgc}, wgs {self.wgs}"
1095-
)
1064+
return f"{self.runtime.value.upper()} EmptyKernel, wgc {self.wgc}, wgs {self.wgs}{self.cpu_count_str(separator=',')}"
10961065

10971066
def get_tags(self):
10981067
return [runtime_to_tag_name(self.runtime), "micro", "latency", "submit"]
10991068

11001069
def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
11011070
iters = self.get_iters(run_trace)
1102-
bin_args = [
1071+
return [
11031072
f"--iterations={iters}",
11041073
f"--wgs={self.wgs}",
11051074
f"--wgc={self.wgc}",
1075+
f"--profilerType={self.profiler_type.value}",
11061076
]
1107-
if self.runtime == RUNTIMES.SYCL:
1108-
bin_args.append(f"--profilerType={self.profiler_type.value}")
1109-
return bin_args
11101077

11111078

11121079
class UllsKernelSwitch(ComputeBenchmark):

0 commit comments

Comments
 (0)