From af11dfae550a49af961505bf4318b5a731f0062b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Patryk=20Kami=C5=84ski?= <patryk.kaminski@intel.com>
Date: Tue, 30 Sep 2025 11:52:08 +0000
Subject: [PATCH 01/11] [Benchmarks] Use combo profiler in UR SubmitKernel
 scenarios

---
 devops/scripts/benchmarks/benches/compute.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py
index 950321ba594bc..c99a7a7637812 100644
--- a/devops/scripts/benchmarks/benches/compute.py
+++ b/devops/scripts/benchmarks/benches/compute.py
@@ -61,8 +61,8 @@ def git_url(self) -> str:
         return "https://github.com/intel/compute-benchmarks.git"
 
     def git_hash(self) -> str:
-        # Sep 25, 2025
-        return "7ba2e629404e34c635a46f28550a0952717d120f"
+        # Sep 26, 2025
+        return "db0d6708a37de69d844d6521ca06ef35b1cc55af"
 
     def setup(self) -> None:
         if options.sycl is None:
@@ -152,7 +152,7 @@ def benchmarks(self) -> list[Benchmark]:
                         kernel_exec_time,
                     )
                 )
-                if runtime == RUNTIMES.SYCL:
+                if runtime in (RUNTIMES.SYCL, RUNTIMES.UR):
                     # Create CPU count variant
                     benches.append(
                         SubmitKernel(
@@ -576,7 +576,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--KernelExecTime={self.KernelExecTime}",
             f"--UseEvents={self.UseEvents}",
         ]
-        if self.runtime == RUNTIMES.SYCL:
+        if self.runtime == RUNTIMES.SYCL or self.runtime == RUNTIMES.UR:
             bin_args.append(f"--profilerType={self.profiler_type.value}")
         return bin_args
 

From 65d85c8783b08c818c88413d1595cdb2362a0d22 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Patryk=20Kami=C5=84ski?= <patryk.kaminski@intel.com>
Date: Tue, 30 Sep 2025 12:48:07 +0000
Subject: [PATCH 02/11] [Benchmarks] Measure cpu instructions optionally

It makes sense to measure either elapsed time or retired CPU
instructions, not both. Adding a script argument that runs Compute
Benchmarks scenarios with only one of the two profiler types
significantly lowers the number of benchmark scenarios. This makes
the tests take less time to complete and lets the user choose whether
to produce time or CPU instruction count results, where applicable.
---
 .github/workflows/sycl-linux-precommit.yml    |  1 +
 .github/workflows/sycl-linux-run-tests.yml    |  8 ++
 .../workflows/sycl-nightly-benchmarking.yml   |  1 +
 .../workflows/sycl-ur-perf-benchmarking.yml   | 10 ++
 devops/actions/run-tests/benchmark/action.yml | 19 +++-
 devops/scripts/benchmarks/benches/compute.py  | 99 +++++++++++++------
 devops/scripts/benchmarks/main.py             |  9 ++
 devops/scripts/benchmarks/options.py          |  1 +
 8 files changed, 116 insertions(+), 32 deletions(-)

diff --git a/.github/workflows/sycl-linux-precommit.yml b/.github/workflows/sycl-linux-precommit.yml
index 12986e98617fc..0fa4d28e904bc 100644
--- a/.github/workflows/sycl-linux-precommit.yml
+++ b/.github/workflows/sycl-linux-precommit.yml
@@ -241,6 +241,7 @@ jobs:
       benchmark_upload_results: false
       benchmark_preset: 'Minimal'
       benchmark_dry_run: true
+      benchmark_profiler_type: 'cpuCounter'
       repo_ref: ${{ github.sha }}
       toolchain_artifact: ${{ needs.build.outputs.toolchain_artifact }}
       toolchain_artifact_filename: ${{ needs.build.outputs.toolchain_artifact_filename }}
diff --git a/.github/workflows/sycl-linux-run-tests.yml b/.github/workflows/sycl-linux-run-tests.yml
index de81af128d58d..b0947f6521350 100644
--- a/.github/workflows/sycl-linux-run-tests.yml
+++ b/.github/workflows/sycl-linux-run-tests.yml
@@ -140,6 +140,13 @@ on:
         type: string
         default: 'false'
         required: False
+      benchmark_profiler_type:
+        description: |
+          Type of profiler to use for benchmarks. Options are "timer" and
+          "cpuCounter". Default is "cpuCounter".
+        type: string
+        default: 'cpuCounter'
+        required: False
 
   workflow_dispatch:
     inputs:
@@ -359,6 +366,7 @@ jobs:
         preset: ${{ inputs.benchmark_preset }}
         dry_run: ${{ inputs.benchmark_dry_run }}
         build_ref: ${{ inputs.repo_ref }}
+        profiler_type: ${{ inputs.benchmark_profiler_type }}
       env:
         RUNNER_TAG: ${{ inputs.runner }}
         GITHUB_TOKEN: ${{ secrets.LLVM_SYCL_BENCHMARK_TOKEN }}
diff --git a/.github/workflows/sycl-nightly-benchmarking.yml b/.github/workflows/sycl-nightly-benchmarking.yml
index 9d1098ee68b33..df7b60928fdee 100644
--- a/.github/workflows/sycl-nightly-benchmarking.yml
+++ b/.github/workflows/sycl-nightly-benchmarking.yml
@@ -45,6 +45,7 @@ jobs:
       benchmark_upload_results: true
       benchmark_save_name: ${{ matrix.save_name }}
       benchmark_preset: ${{ matrix.preset }}
+      benchmark_profiler_type: cpuCounter
       repo_ref: ${{ matrix.ref }}
       toolchain_artifact: ${{ needs.ubuntu2204_build.outputs.toolchain_artifact }}
       toolchain_artifact_filename: ${{ needs.ubuntu2204_build.outputs.toolchain_artifact_filename }}
diff --git a/.github/workflows/sycl-ur-perf-benchmarking.yml b/.github/workflows/sycl-ur-perf-benchmarking.yml
index b5b14cf34ade1..11a36efa5b838 100644
--- a/.github/workflows/sycl-ur-perf-benchmarking.yml
+++ b/.github/workflows/sycl-ur-perf-benchmarking.yml
@@ -61,6 +61,15 @@ on:
           - Normal
           - Test
         default: 'Minimal'  # Only compute-benchmarks
+      benchmark_profiler_type:
+        description: |
+          Type of profiler to use for benchmarks. Options are "timer" and
+          "cpuCounter". Default is "cpuCounter".
+        type: choice
+        options:
+          - timer
+          - cpuCounter
+        default: 'cpuCounter'
       pr_no:
         type: string
         description: |
@@ -192,6 +201,7 @@ jobs:
       benchmark_upload_results: ${{ inputs.upload_results }}
       benchmark_save_name: ${{ needs.sanitize_inputs.outputs.benchmark_save_name }}
       benchmark_preset: ${{ inputs.preset }}
+      benchmark_profiler_type: ${{ inputs.benchmark_profiler_type }}
       repo_ref: ${{ needs.sanitize_inputs.outputs.build_ref }}
       toolchain_artifact: ${{ needs.build_sycl.outputs.toolchain_artifact }}
       toolchain_artifact_filename: ${{ needs.build_sycl.outputs.toolchain_artifact_filename }}
diff --git a/devops/actions/run-tests/benchmark/action.yml b/devops/actions/run-tests/benchmark/action.yml
index add36688f94b5..3898137dfba8b 100644
--- a/devops/actions/run-tests/benchmark/action.yml
+++ b/devops/actions/run-tests/benchmark/action.yml
@@ -32,6 +32,10 @@ inputs:
   dry_run:
     type: string
     required: False
+  profiler_type:
+    type: string
+    required: False
+    default: "cpuCounter"  # Other option is "timer"
 
 runs:
   using: "composite"
@@ -41,6 +45,7 @@ runs:
     env:
       TARGET_DEVICE: ${{ inputs.target_devices }}
       PRESET: ${{ inputs.preset }}
+      PROFILER_TYPE: ${{ inputs.profiler_type }}
     run: |
       case "$RUNNER_TAG" in
         '["PVC_PERF"]' ) ;;
@@ -75,6 +80,17 @@ runs:
       python3 ./devops/scripts/benchmarks/presets.py query "$PRESET"
       [ "$?" -ne 0 ] && exit 1  # Stop workflow if invalid preset
       echo "PRESET=$PRESET" >> $GITHUB_ENV
+
+      # Validate profiler type input
+      case "$PROFILER_TYPE" in
+        "timer") PROFILER_TYPE="timer" ;;
+        "cpuCounter") PROFILER_TYPE="cpuCounter" ;;
+      *) 
+        echo "Invalid profiler type specified: $PROFILER_TYPE"
+        exit 1
+        ;;
+      esac
+      echo "PROFILER_TYPE=$PROFILER_TYPE" >> $GITHUB_ENV
   - name: Compute CPU core range to run benchmarks on
     shell: bash
     run: |
@@ -204,7 +220,8 @@ runs:
         --preset "$PRESET" \
         --timestamp-override "$SAVE_TIMESTAMP" \
         --detect-version sycl,compute_runtime \
-        --flamegraph inclusive
+        --flamegraph inclusive \
+        --profiler-type "$PROFILER_TYPE"
 
       echo "-----"
       python3 ./devops/scripts/benchmarks/compare.py to_hist \
diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py
index c99a7a7637812..88aa40d90671d 100644
--- a/devops/scripts/benchmarks/benches/compute.py
+++ b/devops/scripts/benchmarks/benches/compute.py
@@ -330,6 +330,13 @@ def benchmark_bin(self) -> Path:
         """Returns the path to the benchmark binary"""
         return self.bench.project.build_dir / "bin" / self.bench_name
 
+    def cpu_count_str(self, separator: str = " ") -> str:
+        return (
+            f"{separator}CPU count"
+            if self.profiler_type == PROFILERS.CPU_COUNTER
+            else ""
+        )
+
     def get_iters(self, run_trace: TracingType):
         """Returns the number of iterations to run for the given tracing type."""
         return (
@@ -499,11 +506,16 @@ def supported_runtimes(self) -> list[RUNTIMES]:
         return super().supported_runtimes() + [RUNTIMES.SYCL_PREVIEW]
 
     def enabled(self) -> bool:
-        # This is a workaround for the BMG server where we have old results for self.KernelExecTime=20
-        # The benchmark instance gets created just to make metadata for these old results
         if not super().enabled():
             return False
 
+        if (
+            self.runtime in (RUNTIMES.SYCL, RUNTIMES.UR)
+        ) and options.profiler_type != self.profiler_type.value:
+            return False
+
+        # This is a workaround for the BMG server where we have old results for self.KernelExecTime=20
+        # The benchmark instance gets created just to make metadata for these old results
         device_arch = getattr(options, "device_architecture", "")
         if "bmg" in device_arch and self.KernelExecTime == 20:
             # Disable this benchmark for BMG server, just create metadata
@@ -528,7 +540,7 @@ def name(self):
             f" KernelExecTime={self.KernelExecTime}" if self.KernelExecTime != 1 else ""
         )
 
-        return f"api_overhead_benchmark_{self.runtime.value} SubmitKernel {order}{completion_str}{events_str}{kernel_exec_time_str}"
+        return f"api_overhead_benchmark_{self.runtime.value} SubmitKernel {order}{completion_str}{events_str}{kernel_exec_time_str}{self.cpu_count_str()}"
 
     def display_name(self) -> str:
         order = "in order" if self.ioq else "out of order"
@@ -540,7 +552,7 @@ def display_name(self) -> str:
         if self.KernelExecTime != 1:
             info.append(f"KernelExecTime={self.KernelExecTime}")
         additional_info = f" {' '.join(info)}" if info else ""
-        return f"{self.runtime.value.upper()} SubmitKernel {order}{additional_info}, NumKernels {self.NumKernels}"
+        return f"{self.runtime.value.upper()} SubmitKernel {order}{additional_info}, NumKernels {self.NumKernels}{self.cpu_count_str(', ')}"
 
     def explicit_group(self):
         order = "in order" if self.ioq else "out of order"
@@ -549,7 +561,7 @@ def explicit_group(self):
 
         kernel_exec_time_str = f" long kernel" if self.KernelExecTime != 1 else ""
 
-        return f"SubmitKernel {order}{completion_str}{events_str}{kernel_exec_time_str}"
+        return f"SubmitKernel {order}{completion_str}{events_str}{kernel_exec_time_str}{self.cpu_count_str(', ')}"
 
     def description(self) -> str:
         order = "in-order" if self.ioq else "out-of-order"
@@ -567,7 +579,7 @@ def range(self) -> tuple[float, float]:
 
     def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
         iters = self.get_iters(run_trace)
-        bin_args = [
+        return [
             f"--iterations={iters}",
             f"--Ioq={self.ioq}",
             f"--MeasureCompletion={self.MeasureCompletion}",
@@ -575,10 +587,8 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--NumKernels={self.NumKernels}",
             f"--KernelExecTime={self.KernelExecTime}",
             f"--UseEvents={self.UseEvents}",
+            f"--profilerType={self.profiler_type.value}",
         ]
-        if self.runtime == RUNTIMES.SYCL or self.runtime == RUNTIMES.UR:
-            bin_args.append(f"--profilerType={self.profiler_type.value}")
-        return bin_args
 
     def get_metadata(self) -> dict[str, BenchmarkMetadata]:
         metadata_dict = super().get_metadata()
@@ -616,13 +626,18 @@ def __init__(
             profiler_type=profiler_type,
         )
 
+    def enabled(self) -> bool:
+        if options.profiler_type != self.profiler_type.value:
+            return False
+        return super().enabled()
+
     def name(self):
         order = "in order" if self.ioq else "out of order"
-        return f"api_overhead_benchmark_sycl ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}"
+        return f"api_overhead_benchmark_sycl ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str()}"
 
     def display_name(self) -> str:
         order = "in order" if self.ioq else "out of order"
-        return f"SYCL ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}"
+        return f"SYCL ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str(', ')}"
 
     def description(self) -> str:
         order = "in-order" if self.ioq else "out-of-order"
@@ -666,11 +681,16 @@ def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type):
             profiler_type=profiler_type,
         )
 
+    def enabled(self) -> bool:
+        if options.profiler_type != self.profiler_type.value:
+            return False
+        return super().enabled()
+
     def name(self):
-        return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}"
+        return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str()}"
 
     def display_name(self) -> str:
-        return f"SYCL QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}"
+        return f"SYCL QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str(', ')}"
 
     def description(self) -> str:
         operation = "copy-only" if self.isCopyOnly else "copy and command submission"
@@ -708,11 +728,16 @@ def __init__(self, bench, source, destination, size, profiler_type):
             bench, "memory_benchmark_sycl", "QueueMemcpy", profiler_type=profiler_type
         )
 
+    def enabled(self) -> bool:
+        if options.profiler_type != self.profiler_type.value:
+            return False
+        return super().enabled()
+
     def name(self):
-        return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}"
+        return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str()}"
 
     def display_name(self) -> str:
-        return f"SYCL QueueMemcpy from {self.source} to {self.destination}, size {self.size}"
+        return f"SYCL QueueMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str(', ')}"
 
     def description(self) -> str:
         return (
@@ -998,8 +1023,16 @@ def __init__(
     def supported_runtimes(self) -> list[RUNTIMES]:
         return super().supported_runtimes() + [RUNTIMES.SYCL_PREVIEW]
 
+    def enabled(self) -> bool:
+        if (
+            self.runtime == RUNTIMES.SYCL
+            and options.profiler_type != self.profiler_type.value
+        ):
+            return False
+        return super().enabled()
+
     def explicit_group(self):
-        return f"SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels"
+        return f"SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels{self.cpu_count_str(', ')}"
 
     def description(self) -> str:
         return (
@@ -1008,10 +1041,10 @@ def description(self) -> str:
         )
 
     def name(self):
-        return f"graph_api_benchmark_{self.runtime.value} SubmitGraph{self.use_events_str}{self.host_tasks_str} numKernels:{self.numKernels} ioq {self.inOrderQueue} measureCompletion {self.measureCompletionTime}"
+        return f"graph_api_benchmark_{self.runtime.value} SubmitGraph{self.use_events_str}{self.host_tasks_str} numKernels:{self.numKernels} ioq {self.inOrderQueue} measureCompletion {self.measureCompletionTime}{self.cpu_count_str()}"
 
     def display_name(self) -> str:
-        return f"{self.runtime.value.upper()} SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels"
+        return f"{self.runtime.value.upper()} SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels{self.cpu_count_str(', ')}"
 
     def get_tags(self):
         return [
@@ -1024,7 +1057,7 @@ def get_tags(self):
 
     def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
         iters = self.get_iters(run_trace)
-        bin_args = [
+        return [
             f"--iterations={iters}",
             f"--NumKernels={self.numKernels}",
             f"--MeasureCompletionTime={self.measureCompletionTime}",
@@ -1034,10 +1067,8 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--UseEvents={self.useEvents}",
             "--UseExplicit=0",
             f"--UseHostTasks={self.useHostTasks}",
+            f"--profilerType={self.profiler_type.value}",
         ]
-        if self.runtime == RUNTIMES.SYCL:
-            bin_args.append(f"--profilerType={self.profiler_type.value}")
-        return bin_args
 
     def get_metadata(self) -> dict[str, BenchmarkMetadata]:
         metadata_dict = super().get_metadata()
@@ -1076,33 +1107,39 @@ def __init__(
     def supported_runtimes(self) -> list[RUNTIMES]:
         return [RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]
 
+    def enabled(self) -> bool:
+        if (
+            self.runtime == RUNTIMES.SYCL
+            and options.profiler_type != self.profiler_type.value
+        ):
+            return False
+        return super().enabled()
+
     def explicit_group(self):
-        return f"EmptyKernel, wgc: {self.wgc}, wgs: {self.wgs}"
+        return (
+            f"EmptyKernel, wgc: {self.wgc}, wgs: {self.wgs}{self.cpu_count_str(', ')}"
+        )
 
     def description(self) -> str:
         return ""
 
     def name(self):
-        return f"ulls_benchmark_{self.runtime.value} EmptyKernel wgc:{self.wgc}, wgs:{self.wgs}"
+        return f"ulls_benchmark_{self.runtime.value} EmptyKernel wgc:{self.wgc}, wgs:{self.wgs}{self.cpu_count_str()}"
 
     def display_name(self) -> str:
-        return (
-            f"{self.runtime.value.upper()} EmptyKernel, wgc {self.wgc}, wgs {self.wgs}"
-        )
+        return f"{self.runtime.value.upper()} EmptyKernel, wgc {self.wgc}, wgs {self.wgs}{self.cpu_count_str(', ')}"
 
     def get_tags(self):
         return [runtime_to_tag_name(self.runtime), "micro", "latency", "submit"]
 
     def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
         iters = self.get_iters(run_trace)
-        bin_args = [
+        return [
             f"--iterations={iters}",
             f"--wgs={self.wgs}",
             f"--wgc={self.wgc}",
+            f"--profilerType={self.profiler_type.value}",
         ]
-        if self.runtime == RUNTIMES.SYCL:
-            bin_args.append(f"--profilerType={self.profiler_type.value}")
-        return bin_args
 
 
 class UllsKernelSwitch(ComputeBenchmark):
diff --git a/devops/scripts/benchmarks/main.py b/devops/scripts/benchmarks/main.py
index 7bf6919915cc4..27e49c0d27f86 100755
--- a/devops/scripts/benchmarks/main.py
+++ b/devops/scripts/benchmarks/main.py
@@ -689,6 +689,14 @@ def validate_and_parse_env_args(env_args):
         help="Set the logging level",
         default="info",
     )
+    parser.add_argument(
+        "--profiler-type",
+        type=str,
+        choices=["timer", "cpuCounter"],
+        help="Set the profiler type for benchmarks. 'timer' measures execution time, "
+        "'cpuCounter' measures CPU instruction count for supported benchmarks.",
+        default="timer",
+    )
 
     args = parser.parse_args()
     additional_env_vars = validate_and_parse_env_args(args.env)
@@ -720,6 +728,7 @@ def validate_and_parse_env_args(env_args):
     options.build_jobs = args.build_jobs
     options.hip_arch = args.hip_arch
     options.flamegraph = args.flamegraph is not None
+    options.profiler_type = args.profiler_type
 
     # Initialize logger with command line arguments
     log.initialize(args.verbose, args.log_level)
diff --git a/devops/scripts/benchmarks/options.py b/devops/scripts/benchmarks/options.py
index f17d96313badb..1d1cd889510fc 100644
--- a/devops/scripts/benchmarks/options.py
+++ b/devops/scripts/benchmarks/options.py
@@ -74,6 +74,7 @@ class Options:
     exit_on_failure: bool = False
     flamegraph: bool = False
     unitrace: bool = False
+    profiler_type: str = "timer"
 
     # Options intended for CI:
 

From ee9bd835759981c4196a18a5cbf86933e1bf5404 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Patryk=20Kami=C5=84ski?= <patryk.kaminski@intel.com>
Date: Tue, 7 Oct 2025 12:59:29 +0000
Subject: [PATCH 03/11] More explicit 'CPU count' separator usage

---
 devops/scripts/benchmarks/benches/compute.py | 24 +++++++++-----------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py
index 88aa40d90671d..a74f6161eacd1 100644
--- a/devops/scripts/benchmarks/benches/compute.py
+++ b/devops/scripts/benchmarks/benches/compute.py
@@ -330,9 +330,9 @@ def benchmark_bin(self) -> Path:
         """Returns the path to the benchmark binary"""
         return self.bench.project.build_dir / "bin" / self.bench_name
 
-    def cpu_count_str(self, separator: str = " ") -> str:
+    def cpu_count_str(self, separator: str = "") -> str:
         return (
-            f"{separator}CPU count"
+            f"{separator} CPU count"
             if self.profiler_type == PROFILERS.CPU_COUNTER
             else ""
         )
@@ -552,7 +552,7 @@ def display_name(self) -> str:
         if self.KernelExecTime != 1:
             info.append(f"KernelExecTime={self.KernelExecTime}")
         additional_info = f" {' '.join(info)}" if info else ""
-        return f"{self.runtime.value.upper()} SubmitKernel {order}{additional_info}, NumKernels {self.NumKernels}{self.cpu_count_str(', ')}"
+        return f"{self.runtime.value.upper()} SubmitKernel {order}{additional_info}, NumKernels {self.NumKernels}{self.cpu_count_str(separator=',')}"
 
     def explicit_group(self):
         order = "in order" if self.ioq else "out of order"
@@ -561,7 +561,7 @@ def explicit_group(self):
 
         kernel_exec_time_str = f" long kernel" if self.KernelExecTime != 1 else ""
 
-        return f"SubmitKernel {order}{completion_str}{events_str}{kernel_exec_time_str}{self.cpu_count_str(', ')}"
+        return f"SubmitKernel {order}{completion_str}{events_str}{kernel_exec_time_str}{self.cpu_count_str(separator=',')}"
 
     def description(self) -> str:
         order = "in-order" if self.ioq else "out-of-order"
@@ -637,7 +637,7 @@ def name(self):
 
     def display_name(self) -> str:
         order = "in order" if self.ioq else "out of order"
-        return f"SYCL ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str(', ')}"
+        return f"SYCL ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str(separator=',')}"
 
     def description(self) -> str:
         order = "in-order" if self.ioq else "out-of-order"
@@ -690,7 +690,7 @@ def name(self):
         return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str()}"
 
     def display_name(self) -> str:
-        return f"SYCL QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str(', ')}"
+        return f"SYCL QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str(separator=',')}"
 
     def description(self) -> str:
         operation = "copy-only" if self.isCopyOnly else "copy and command submission"
@@ -737,7 +737,7 @@ def name(self):
         return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str()}"
 
     def display_name(self) -> str:
-        return f"SYCL QueueMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str(', ')}"
+        return f"SYCL QueueMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str(separator=',')}"
 
     def description(self) -> str:
         return (
@@ -1032,7 +1032,7 @@ def enabled(self) -> bool:
         return super().enabled()
 
     def explicit_group(self):
-        return f"SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels{self.cpu_count_str(', ')}"
+        return f"SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels{self.cpu_count_str(separator=',')}"
 
     def description(self) -> str:
         return (
@@ -1044,7 +1044,7 @@ def name(self):
         return f"graph_api_benchmark_{self.runtime.value} SubmitGraph{self.use_events_str}{self.host_tasks_str} numKernels:{self.numKernels} ioq {self.inOrderQueue} measureCompletion {self.measureCompletionTime}{self.cpu_count_str()}"
 
     def display_name(self) -> str:
-        return f"{self.runtime.value.upper()} SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels{self.cpu_count_str(', ')}"
+        return f"{self.runtime.value.upper()} SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels{self.cpu_count_str(separator=',')}"
 
     def get_tags(self):
         return [
@@ -1116,9 +1116,7 @@ def enabled(self) -> bool:
         return super().enabled()
 
     def explicit_group(self):
-        return (
-            f"EmptyKernel, wgc: {self.wgc}, wgs: {self.wgs}{self.cpu_count_str(', ')}"
-        )
+        return f"EmptyKernel, wgc: {self.wgc}, wgs: {self.wgs}{self.cpu_count_str(separator=',')}"
 
     def description(self) -> str:
         return ""
@@ -1127,7 +1125,7 @@ def name(self):
         return f"ulls_benchmark_{self.runtime.value} EmptyKernel wgc:{self.wgc}, wgs:{self.wgs}{self.cpu_count_str()}"
 
     def display_name(self) -> str:
-        return f"{self.runtime.value.upper()} EmptyKernel, wgc {self.wgc}, wgs {self.wgs}{self.cpu_count_str(', ')}"
+        return f"{self.runtime.value.upper()} EmptyKernel, wgc {self.wgc}, wgs {self.wgs}{self.cpu_count_str(separator=',')}"
 
     def get_tags(self):
         return [runtime_to_tag_name(self.runtime), "micro", "latency", "submit"]

From 27da24e820141e63339a3aec753ae3aa29db3de1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Patryk=20Kami=C5=84ski?= <patryk.kaminski@intel.com>
Date: Wed, 8 Oct 2025 11:34:48 +0000
Subject: [PATCH 04/11] Apply review comments changes

Set `cpuCounter` as the default profiler type.
---
 .github/workflows/sycl-linux-precommit.yml      | 1 -
 .github/workflows/sycl-nightly-benchmarking.yml | 1 -
 devops/scripts/benchmarks/main.py               | 2 +-
 3 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/workflows/sycl-linux-precommit.yml b/.github/workflows/sycl-linux-precommit.yml
index 0fa4d28e904bc..12986e98617fc 100644
--- a/.github/workflows/sycl-linux-precommit.yml
+++ b/.github/workflows/sycl-linux-precommit.yml
@@ -241,7 +241,6 @@ jobs:
       benchmark_upload_results: false
       benchmark_preset: 'Minimal'
       benchmark_dry_run: true
-      benchmark_profiler_type: 'cpuCounter'
       repo_ref: ${{ github.sha }}
       toolchain_artifact: ${{ needs.build.outputs.toolchain_artifact }}
       toolchain_artifact_filename: ${{ needs.build.outputs.toolchain_artifact_filename }}
diff --git a/.github/workflows/sycl-nightly-benchmarking.yml b/.github/workflows/sycl-nightly-benchmarking.yml
index df7b60928fdee..9d1098ee68b33 100644
--- a/.github/workflows/sycl-nightly-benchmarking.yml
+++ b/.github/workflows/sycl-nightly-benchmarking.yml
@@ -45,7 +45,6 @@ jobs:
       benchmark_upload_results: true
       benchmark_save_name: ${{ matrix.save_name }}
       benchmark_preset: ${{ matrix.preset }}
-      benchmark_profiler_type: cpuCounter
       repo_ref: ${{ matrix.ref }}
       toolchain_artifact: ${{ needs.ubuntu2204_build.outputs.toolchain_artifact }}
       toolchain_artifact_filename: ${{ needs.ubuntu2204_build.outputs.toolchain_artifact_filename }}
diff --git a/devops/scripts/benchmarks/main.py b/devops/scripts/benchmarks/main.py
index 27e49c0d27f86..b22c79b2b14c7 100755
--- a/devops/scripts/benchmarks/main.py
+++ b/devops/scripts/benchmarks/main.py
@@ -695,7 +695,7 @@ def validate_and_parse_env_args(env_args):
         choices=["timer", "cpuCounter"],
         help="Set the profiler type for benchmarks. 'timer' measures execution time, "
         "'cpuCounter' measures CPU instruction count for supported benchmarks.",
-        default="timer",
+        default="cpuCounter",
     )
 
     args = parser.parse_args()

From d36416cbbba9263b4def0bddd5b6a5e822d4cd47 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Patryk=20Kami=C5=84ski?= <patryk.kaminski@intel.com>
Date: Wed, 8 Oct 2025 12:11:12 +0000
Subject: [PATCH 05/11] Remove previous way of naming in Result

---
 devops/scripts/benchmarks/benches/compute.py | 44 +++-----------------
 1 file changed, 5 insertions(+), 39 deletions(-)

diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py
index a74f6161eacd1..3b03cd2371083 100644
--- a/devops/scripts/benchmarks/benches/compute.py
+++ b/devops/scripts/benchmarks/benches/compute.py
@@ -419,16 +419,16 @@ def run(
         )
         parsed_results = self.parse_output(result)
         ret = []
-        for label, median, stddev, unit in parsed_results:
-            extra_label = " CPU count" if parse_unit_type(unit) == "instr" else ""
+        for median, stddev in parsed_results:
+            unit = "instr" if self.profiler_type == PROFILERS.CPU_COUNTER else "μs"
             ret.append(
                 Result(
-                    label=self.name() + extra_label,
+                    label=self.name(),
                     value=median,
                     stddev=stddev,
                     command=command,
                     env=env_vars,
-                    unit=parse_unit_type(unit),
+                    unit=unit,
                     git_url=self.bench.git_url(),
                     git_hash=self.bench.git_hash(),
                 )
@@ -445,7 +445,6 @@ def parse_output(self, output):
             if data_row is None:
                 break
             try:
-                label = data_row[0]
                 mean = float(data_row[1])
                 median = float(data_row[2])
                 # compute benchmarks report stddev as %
@@ -453,8 +452,7 @@ def parse_output(self, output):
                 if not math.isfinite(stddev):
                     stddev = 0.0  # Default to 0.0 if stddev is invalid
 
-                unit = data_row[7]
-                results.append((label, median, stddev, unit))
+                results.append((median, stddev))
             except (ValueError, IndexError) as e:
                 raise ValueError(f"Error parsing output: {e}")
         if len(results) == 0:
@@ -590,22 +588,6 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--profilerType={self.profiler_type.value}",
         ]
 
-    def get_metadata(self) -> dict[str, BenchmarkMetadata]:
-        metadata_dict = super().get_metadata()
-
-        # Create CPU count variant with modified display name and explicit_group
-        cpu_count_name = self.name() + " CPU count"
-        cpu_count_metadata = copy.deepcopy(metadata_dict[self.name()])
-        cpu_count_display_name = self.display_name() + ", CPU count"
-        cpu_count_explicit_group = (
-            self.explicit_group() + ", CPU count" if self.explicit_group() else ""
-        )
-        cpu_count_metadata.display_name = cpu_count_display_name
-        cpu_count_metadata.explicit_group = cpu_count_explicit_group
-        metadata_dict[cpu_count_name] = cpu_count_metadata
-
-        return metadata_dict
-
 
 class ExecImmediateCopyQueue(ComputeBenchmark):
     def __init__(
@@ -1070,22 +1052,6 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--profilerType={self.profiler_type.value}",
         ]
 
-    def get_metadata(self) -> dict[str, BenchmarkMetadata]:
-        metadata_dict = super().get_metadata()
-
-        # Create CPU count variant with modified display name and explicit_group
-        cpu_count_name = self.name() + " CPU count"
-        cpu_count_metadata = copy.deepcopy(metadata_dict[self.name()])
-        cpu_count_display_name = self.display_name() + ", CPU count"
-        cpu_count_explicit_group = (
-            self.explicit_group() + ", CPU count" if self.explicit_group() else ""
-        )
-        cpu_count_metadata.display_name = cpu_count_display_name
-        cpu_count_metadata.explicit_group = cpu_count_explicit_group
-        metadata_dict[cpu_count_name] = cpu_count_metadata
-
-        return metadata_dict
-
 
 class UllsEmptyKernel(ComputeBenchmark):
     def __init__(

From 4ed0f639e258fbda5ee89ee84d2f76541de33a73 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Patryk=20Kami=C5=84ski?= <patryk.kaminski@intel.com>
Date: Wed, 8 Oct 2025 13:30:49 +0000
Subject: [PATCH 06/11] Remove unused parse_unit_type() function

---
 devops/scripts/benchmarks/benches/compute.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py
index 3b03cd2371083..d57c82923d533 100644
--- a/devops/scripts/benchmarks/benches/compute.py
+++ b/devops/scripts/benchmarks/benches/compute.py
@@ -294,14 +294,6 @@ def benchmarks(self) -> list[Benchmark]:
         return benches
 
 
-def parse_unit_type(compute_unit):
-    if "[count]" in compute_unit:
-        return "instr"
-    elif "[us]" in compute_unit:
-        return "μs"
-    return compute_unit.replace("[", "").replace("]", "")
-
-
 class ComputeBenchmark(Benchmark):
     def __init__(
         self,

From c7b1b2b84318f680ae0b34674ed47343d3edcaed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Patryk=20Kami=C5=84ski?= <patryk.kaminski@intel.com>
Date: Thu, 9 Oct 2025 12:34:52 +0000
Subject: [PATCH 07/11] Revert adding profiler-type option

---
 .github/workflows/sycl-linux-run-tests.yml    |  8 ----
 .../workflows/sycl-ur-perf-benchmarking.yml   | 10 -----
 devops/actions/run-tests/benchmark/action.yml | 19 +--------
 devops/scripts/benchmarks/benches/compute.py  | 42 ++-----------------
 devops/scripts/benchmarks/main.py             |  9 ----
 devops/scripts/benchmarks/options.py          |  1 -
 6 files changed, 4 insertions(+), 85 deletions(-)

diff --git a/.github/workflows/sycl-linux-run-tests.yml b/.github/workflows/sycl-linux-run-tests.yml
index b0947f6521350..de81af128d58d 100644
--- a/.github/workflows/sycl-linux-run-tests.yml
+++ b/.github/workflows/sycl-linux-run-tests.yml
@@ -140,13 +140,6 @@ on:
         type: string
         default: 'false'
         required: False
-      benchmark_profiler_type:
-        description: |
-          Type of profiler to use for benchmarks. Options are "timer" and
-          "cpuCounter". Default is "cpuCounter".
-        type: string
-        default: 'cpuCounter'
-        required: False
 
   workflow_dispatch:
     inputs:
@@ -366,7 +359,6 @@ jobs:
         preset: ${{ inputs.benchmark_preset }}
         dry_run: ${{ inputs.benchmark_dry_run }}
         build_ref: ${{ inputs.repo_ref }}
-        profiler_type: ${{ inputs.benchmark_profiler_type }}
       env:
         RUNNER_TAG: ${{ inputs.runner }}
         GITHUB_TOKEN: ${{ secrets.LLVM_SYCL_BENCHMARK_TOKEN }}
diff --git a/.github/workflows/sycl-ur-perf-benchmarking.yml b/.github/workflows/sycl-ur-perf-benchmarking.yml
index 11a36efa5b838..b5b14cf34ade1 100644
--- a/.github/workflows/sycl-ur-perf-benchmarking.yml
+++ b/.github/workflows/sycl-ur-perf-benchmarking.yml
@@ -61,15 +61,6 @@ on:
           - Normal
           - Test
         default: 'Minimal'  # Only compute-benchmarks
-      benchmark_profiler_type:
-        description: |
-          Type of profiler to use for benchmarks. Options are "timer" and
-          "cpuCounter". Default is "cpuCounter".
-        type: choice
-        options:
-          - timer
-          - cpuCounter
-        default: 'cpuCounter'
       pr_no:
         type: string
         description: |
@@ -201,7 +192,6 @@ jobs:
       benchmark_upload_results: ${{ inputs.upload_results }}
       benchmark_save_name: ${{ needs.sanitize_inputs.outputs.benchmark_save_name }}
       benchmark_preset: ${{ inputs.preset }}
-      benchmark_profiler_type: ${{ inputs.benchmark_profiler_type }}
       repo_ref: ${{ needs.sanitize_inputs.outputs.build_ref }}
       toolchain_artifact: ${{ needs.build_sycl.outputs.toolchain_artifact }}
       toolchain_artifact_filename: ${{ needs.build_sycl.outputs.toolchain_artifact_filename }}
diff --git a/devops/actions/run-tests/benchmark/action.yml b/devops/actions/run-tests/benchmark/action.yml
index 3898137dfba8b..add36688f94b5 100644
--- a/devops/actions/run-tests/benchmark/action.yml
+++ b/devops/actions/run-tests/benchmark/action.yml
@@ -32,10 +32,6 @@ inputs:
   dry_run:
     type: string
     required: False
-  profiler_type:
-    type: string
-    required: False
-    default: "cpuCounter"  # Other option is "timer"
 
 runs:
   using: "composite"
@@ -45,7 +41,6 @@ runs:
     env:
       TARGET_DEVICE: ${{ inputs.target_devices }}
       PRESET: ${{ inputs.preset }}
-      PROFILER_TYPE: ${{ inputs.profiler_type }}
     run: |
       case "$RUNNER_TAG" in
         '["PVC_PERF"]' ) ;;
@@ -80,17 +75,6 @@ runs:
       python3 ./devops/scripts/benchmarks/presets.py query "$PRESET"
       [ "$?" -ne 0 ] && exit 1  # Stop workflow if invalid preset
       echo "PRESET=$PRESET" >> $GITHUB_ENV
-
-      # Validate profiler type input
-      case "$PROFILER_TYPE" in
-        "timer") PROFILER_TYPE="timer" ;;
-        "cpuCounter") PROFILER_TYPE="cpuCounter" ;;
-      *) 
-        echo "Invalid profiler type specified: $PROFILER_TYPE"
-        exit 1
-        ;;
-      esac
-      echo "PROFILER_TYPE=$PROFILER_TYPE" >> $GITHUB_ENV
   - name: Compute CPU core range to run benchmarks on
     shell: bash
     run: |
@@ -220,8 +204,7 @@ runs:
         --preset "$PRESET" \
         --timestamp-override "$SAVE_TIMESTAMP" \
         --detect-version sycl,compute_runtime \
-        --flamegraph inclusive \
-        --profiler-type "$PROFILER_TYPE"
+        --flamegraph inclusive
 
       echo "-----"
       python3 ./devops/scripts/benchmarks/compare.py to_hist \
diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py
index d57c82923d533..3f0484669bb53 100644
--- a/devops/scripts/benchmarks/benches/compute.py
+++ b/devops/scripts/benchmarks/benches/compute.py
@@ -427,7 +427,7 @@ def run(
             )
         return ret
 
-    def parse_output(self, output):
+    def parse_output(self, output: str) -> list[tuple[float, float]]:
         csv_file = io.StringIO(output)
         reader = csv.reader(csv_file)
         next(reader, None)
@@ -496,16 +496,11 @@ def supported_runtimes(self) -> list[RUNTIMES]:
         return super().supported_runtimes() + [RUNTIMES.SYCL_PREVIEW]
 
     def enabled(self) -> bool:
+        # This is a workaround for the BMG server where we have old results for self.KernelExecTime=20
+        # The benchmark instance gets created just to make metadata for these old results
         if not super().enabled():
             return False
 
-        if (
-            self.runtime in (RUNTIMES.SYCL, RUNTIMES.UR)
-        ) and options.profiler_type != self.profiler_type.value:
-            return False
-
-        # This is a workaround for the BMG server where we have old results for self.KernelExecTime=20
-        # The benchmark instance gets created just to make metadata for these old results
         device_arch = getattr(options, "device_architecture", "")
         if "bmg" in device_arch and self.KernelExecTime == 20:
             # Disable this benchmark for BMG server, just create metadata
@@ -600,11 +595,6 @@ def __init__(
             profiler_type=profiler_type,
         )
 
-    def enabled(self) -> bool:
-        if options.profiler_type != self.profiler_type.value:
-            return False
-        return super().enabled()
-
     def name(self):
         order = "in order" if self.ioq else "out of order"
         return f"api_overhead_benchmark_sycl ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str()}"
@@ -655,11 +645,6 @@ def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type):
             profiler_type=profiler_type,
         )
 
-    def enabled(self) -> bool:
-        if options.profiler_type != self.profiler_type.value:
-            return False
-        return super().enabled()
-
     def name(self):
         return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str()}"
 
@@ -702,11 +687,6 @@ def __init__(self, bench, source, destination, size, profiler_type):
             bench, "memory_benchmark_sycl", "QueueMemcpy", profiler_type=profiler_type
         )
 
-    def enabled(self) -> bool:
-        if options.profiler_type != self.profiler_type.value:
-            return False
-        return super().enabled()
-
     def name(self):
         return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str()}"
 
@@ -997,14 +977,6 @@ def __init__(
     def supported_runtimes(self) -> list[RUNTIMES]:
         return super().supported_runtimes() + [RUNTIMES.SYCL_PREVIEW]
 
-    def enabled(self) -> bool:
-        if (
-            self.runtime == RUNTIMES.SYCL
-            and options.profiler_type != self.profiler_type.value
-        ):
-            return False
-        return super().enabled()
-
     def explicit_group(self):
         return f"SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels{self.cpu_count_str(separator=',')}"
 
@@ -1065,14 +1037,6 @@ def __init__(
     def supported_runtimes(self) -> list[RUNTIMES]:
         return [RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]
 
-    def enabled(self) -> bool:
-        if (
-            self.runtime == RUNTIMES.SYCL
-            and options.profiler_type != self.profiler_type.value
-        ):
-            return False
-        return super().enabled()
-
     def explicit_group(self):
         return f"EmptyKernel, wgc: {self.wgc}, wgs: {self.wgs}{self.cpu_count_str(separator=',')}"
 
diff --git a/devops/scripts/benchmarks/main.py b/devops/scripts/benchmarks/main.py
index b22c79b2b14c7..7bf6919915cc4 100755
--- a/devops/scripts/benchmarks/main.py
+++ b/devops/scripts/benchmarks/main.py
@@ -689,14 +689,6 @@ def validate_and_parse_env_args(env_args):
         help="Set the logging level",
         default="info",
     )
-    parser.add_argument(
-        "--profiler-type",
-        type=str,
-        choices=["timer", "cpuCounter"],
-        help="Set the profiler type for benchmarks. 'timer' measures execution time, "
-        "'cpuCounter' measures CPU instruction count for supported benchmarks.",
-        default="cpuCounter",
-    )
 
     args = parser.parse_args()
     additional_env_vars = validate_and_parse_env_args(args.env)
@@ -728,7 +720,6 @@ def validate_and_parse_env_args(env_args):
     options.build_jobs = args.build_jobs
     options.hip_arch = args.hip_arch
     options.flamegraph = args.flamegraph is not None
-    options.profiler_type = args.profiler_type
 
     # Initialize logger with command line arguments
     log.initialize(args.verbose, args.log_level)
diff --git a/devops/scripts/benchmarks/options.py b/devops/scripts/benchmarks/options.py
index 1d1cd889510fc..f17d96313badb 100644
--- a/devops/scripts/benchmarks/options.py
+++ b/devops/scripts/benchmarks/options.py
@@ -74,7 +74,6 @@ class Options:
     exit_on_failure: bool = False
     flamegraph: bool = False
     unitrace: bool = False
-    profiler_type: str = "timer"
 
     # Options intended for CI:
 

From 74552d90ad262b942a097009580e4fb66db5b453 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Patryk=20Kami=C5=84ski?= <patryk.kaminski@intel.com>
Date: Thu, 9 Oct 2025 19:02:29 +0000
Subject: [PATCH 08/11] Add EmulateGraph parameter to SubmitGraph benches

---
 devops/scripts/benchmarks/benches/compute.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py
index 3f0484669bb53..aa53b5969ae25 100644
--- a/devops/scripts/benchmarks/benches/compute.py
+++ b/devops/scripts/benchmarks/benches/compute.py
@@ -61,8 +61,8 @@ def git_url(self) -> str:
         return "https://github.com/intel/compute-benchmarks.git"
 
     def git_hash(self) -> str:
-        # Sep 26, 2025
-        return "db0d6708a37de69d844d6521ca06ef35b1cc55af"
+        # Oct 9, 2025
+        return "32805b4b6f8dafb4a97f21c4c85bb2f6963f8dbb"
 
     def setup(self) -> None:
         if options.sycl is None:
@@ -203,6 +203,9 @@ def benchmarks(self) -> list[Benchmark]:
             measure_completion_time,
             use_events,
         ) in submit_graph_params:
+            emulate_graphs = (
+                0 if runtime in (RUNTIMES.SYCL, RUNTIMES.SYCL_PREVIEW) else 1
+            )
             benches.append(
                 GraphApiSubmitGraph(
                     self,
@@ -211,6 +214,7 @@ def benchmarks(self) -> list[Benchmark]:
                     num_kernels,
                     measure_completion_time,
                     use_events,
+                    emulate_graphs,
                     useHostTasks=0,
                 )
             )
@@ -224,6 +228,7 @@ def benchmarks(self) -> list[Benchmark]:
                         num_kernels,
                         measure_completion_time,
                         use_events,
+                        emulate_graphs,
                         useHostTasks=0,
                         profiler_type=PROFILERS.CPU_COUNTER,
                     )
@@ -949,6 +954,7 @@ def __init__(
         numKernels,
         measureCompletionTime,
         useEvents,
+        emulate_graphs,
         useHostTasks,
         profiler_type=PROFILERS.TIMER,
     ):
@@ -957,12 +963,14 @@ def __init__(
         self.measureCompletionTime = measureCompletionTime
         self.useEvents = useEvents
         self.useHostTasks = useHostTasks
+        self.emulateGraphs = emulate_graphs
         self.ioq_str = "in order" if self.inOrderQueue else "out of order"
         self.measure_str = (
             " with measure completion" if self.measureCompletionTime else ""
         )
         self.use_events_str = f" with events" if self.useEvents else ""
         self.host_tasks_str = f" use host tasks" if self.useHostTasks else ""
+        self.emulate_graphs_str = f" emulate graphs" if self.emulateGraphs else ""
         # iterations per bin_args: --iterations=10000
         self.iterations_regular = 10000
         self.iterations_trace = 10
@@ -978,7 +986,7 @@ def supported_runtimes(self) -> list[RUNTIMES]:
         return super().supported_runtimes() + [RUNTIMES.SYCL_PREVIEW]
 
     def explicit_group(self):
-        return f"SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels{self.cpu_count_str(separator=',')}"
+        return f"SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}{self.emulate_graphs_str}, {self.numKernels} kernels{self.cpu_count_str(separator=',')}"
 
     def description(self) -> str:
         return (
@@ -987,10 +995,10 @@ def description(self) -> str:
         )
 
     def name(self):
-        return f"graph_api_benchmark_{self.runtime.value} SubmitGraph{self.use_events_str}{self.host_tasks_str} numKernels:{self.numKernels} ioq {self.inOrderQueue} measureCompletion {self.measureCompletionTime}{self.cpu_count_str()}"
+        return f"graph_api_benchmark_{self.runtime.value} SubmitGraph{self.use_events_str}{self.host_tasks_str}{self.emulate_graphs_str} numKernels:{self.numKernels} ioq {self.inOrderQueue} measureCompletion {self.measureCompletionTime}{self.cpu_count_str()}"
 
     def display_name(self) -> str:
-        return f"{self.runtime.value.upper()} SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels{self.cpu_count_str(separator=',')}"
+        return f"{self.runtime.value.upper()} SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}{self.emulate_graphs_str}, {self.numKernels} kernels{self.cpu_count_str(separator=',')}"
 
     def get_tags(self):
         return [
@@ -1014,6 +1022,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             "--UseExplicit=0",
             f"--UseHostTasks={self.useHostTasks}",
             f"--profilerType={self.profiler_type.value}",
+            f"--EmulateGraphs={self.emulateGraphs}",
         ]
 
 

From 1aee5e0d33107d28a1582c25b523c6a8ffeb690b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Patryk=20Kami=C5=84ski?= <patryk.kaminski@intel.com>
Date: Tue, 14 Oct 2025 07:50:05 +0000
Subject: [PATCH 09/11] Don't create new names

Avoid creating new groups/names as each runtime has only one valid EmulateGraphs value.
---
 devops/scripts/benchmarks/benches/compute.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py
index bafde23056c1c..247c824365c62 100644
--- a/devops/scripts/benchmarks/benches/compute.py
+++ b/devops/scripts/benchmarks/benches/compute.py
@@ -974,7 +974,6 @@ def __init__(
         )
         self.use_events_str = f" with events" if self.useEvents else ""
         self.host_tasks_str = f" use host tasks" if self.useHostTasks else ""
-        self.emulate_graphs_str = f" emulate graphs" if self.emulateGraphs else ""
         # iterations per bin_args: --iterations=10000
         self.iterations_regular = 10000
         self.iterations_trace = 10
@@ -990,7 +989,7 @@ def supported_runtimes(self) -> list[RUNTIMES]:
         return super().supported_runtimes() + [RUNTIMES.SYCL_PREVIEW]
 
     def explicit_group(self):
-        return f"SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}{self.emulate_graphs_str}, {self.numKernels} kernels{self.cpu_count_str(separator=',')}"
+        return f"SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels{self.cpu_count_str(separator=',')}"
 
     def description(self) -> str:
         return (
@@ -999,10 +998,10 @@ def description(self) -> str:
         )
 
     def name(self):
-        return f"graph_api_benchmark_{self.runtime.value} SubmitGraph{self.use_events_str}{self.host_tasks_str}{self.emulate_graphs_str} numKernels:{self.numKernels} ioq {self.inOrderQueue} measureCompletion {self.measureCompletionTime}{self.cpu_count_str()}"
+        return f"graph_api_benchmark_{self.runtime.value} SubmitGraph{self.use_events_str}{self.host_tasks_str} numKernels:{self.numKernels} ioq {self.inOrderQueue} measureCompletion {self.measureCompletionTime}{self.cpu_count_str()}"
 
     def display_name(self) -> str:
-        return f"{self.runtime.value.upper()} SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}{self.emulate_graphs_str}, {self.numKernels} kernels{self.cpu_count_str(separator=',')}"
+        return f"{self.runtime.value.upper()} SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels{self.cpu_count_str(separator=',')}"
 
     def get_tags(self):
         return [

From d5abaea32df41dccb59dcb02889a147578dec80d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Patryk=20Kami=C5=84ski?= <patryk.kaminski@intel.com>
Date: Thu, 16 Oct 2025 08:13:22 +0000
Subject: [PATCH 10/11] Add comment on emulated graphs

---
 devops/scripts/benchmarks/benches/compute.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py
index 247c824365c62..db043f1a624e2 100644
--- a/devops/scripts/benchmarks/benches/compute.py
+++ b/devops/scripts/benchmarks/benches/compute.py
@@ -203,6 +203,8 @@ def benchmarks(self) -> list[Benchmark]:
             measure_completion_time,
             use_events,
         ) in submit_graph_params:
+            # Non-SYCL runtimes have to be run with emulated graphs,
+            # see: https://github.com/intel/compute-benchmarks/commit/d81d5d602739482b9070c872a28c0b5ebb41de70
             emulate_graphs = (
                 0 if runtime in (RUNTIMES.SYCL, RUNTIMES.SYCL_PREVIEW) else 1
             )

From 08a5914e208baad2fab77ccb332f91b18c7a3d8a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Patryk=20Kami=C5=84ski?= <patryk.kaminski@intel.com>
Date: Thu, 16 Oct 2025 08:26:07 +0000
Subject: [PATCH 11/11] Add cpu counter runs for syclpreview variant of
 SubmitKernel benchmarks

---
 devops/scripts/benchmarks/benches/compute.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py
index db043f1a624e2..a308d9cc19dc8 100644
--- a/devops/scripts/benchmarks/benches/compute.py
+++ b/devops/scripts/benchmarks/benches/compute.py
@@ -152,7 +152,7 @@ def benchmarks(self) -> list[Benchmark]:
                         kernel_exec_time,
                     )
                 )
-                if runtime in (RUNTIMES.SYCL, RUNTIMES.UR):
+                if runtime in (RUNTIMES.SYCL, RUNTIMES.SYCL_PREVIEW, RUNTIMES.UR):
                     # Create CPU count variant
                     benches.append(
                         SubmitKernel(