Skip to content

Commit 0eb7c73

Browse files
authored
[Benchmarks] Bump Compute Benchmarks (#19857)
- Stabilize results with a small sleep between binary runs.
- Add combo-profiler functionality, which allows choosing between time and CPU counter measurement.
1 parent fe8a292 commit 0eb7c73

File tree

1 file changed

+122
-51
lines changed

1 file changed

+122
-51
lines changed

devops/scripts/benchmarks/benches/compute.py

Lines changed: 122 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ def git_url(self) -> str:
5454
return "https://github.com/intel/compute-benchmarks.git"
5555

5656
def git_hash(self) -> str:
57-
return "c9e135d4f26dd6badd83009f92f25d6285fc1e21"
57+
return "4995560017559849a519e58978a0afdd55903e15"
5858

5959
def setup(self) -> None:
6060
if options.sycl is None:
@@ -177,6 +177,9 @@ def benchmarks(self) -> list[Benchmark]:
177177
# See SubmitKernel.enabled()
178178
long_kernel_exec_time_ooo = [20, 200]
179179

180+
# The Combo Profiler is available only for selected sycl benchmarks
181+
profiler_types = ["timer", "cpuCounter"]
182+
180183
for runtime in list(RUNTIMES):
181184
# Add SubmitKernel benchmarks using loops
182185
for in_order_queue in [0, 1]:
@@ -188,16 +191,18 @@ def benchmarks(self) -> list[Benchmark]:
188191
else long_kernel_exec_time_ooo
189192
)
190193
for kernel_exec_time in [1, *long_kernel_exec_time]:
191-
benches.append(
192-
SubmitKernel(
193-
self,
194-
runtime,
195-
in_order_queue,
196-
measure_completion,
197-
use_events,
198-
kernel_exec_time,
194+
for profiler_type in profiler_types:
195+
benches.append(
196+
SubmitKernel(
197+
self,
198+
runtime,
199+
in_order_queue,
200+
measure_completion,
201+
use_events,
202+
kernel_exec_time,
203+
profiler_type,
204+
)
199205
)
200-
)
201206

202207
# Add SinKernelGraph benchmarks
203208
for with_graphs in [0, 1]:
@@ -207,51 +212,69 @@ def benchmarks(self) -> list[Benchmark]:
207212
)
208213

209214
# Add ULLS benchmarks
210-
benches.append(UllsEmptyKernel(self, runtime, 1000, 256))
215+
for profiler_type in profiler_types:
216+
benches.append(UllsEmptyKernel(self, runtime, 1000, 256, profiler_type))
211217
benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1))
212218

213219
# Add GraphApiSubmitGraph benchmarks
214220
for in_order_queue in [0, 1]:
215-
benches.append(
216-
GraphApiSubmitGraph(
217-
self,
218-
runtime,
219-
in_order_queue,
220-
self.submit_graph_num_kernels[-1],
221-
0,
222-
useEvents=0,
223-
useHostTasks=1,
221+
for profiler_type in profiler_types:
222+
benches.append(
223+
GraphApiSubmitGraph(
224+
self,
225+
runtime,
226+
in_order_queue,
227+
self.submit_graph_num_kernels[-1],
228+
0,
229+
profiler_type,
230+
useEvents=0,
231+
useHostTasks=1,
232+
)
224233
)
225-
)
226234
for num_kernels in self.submit_graph_num_kernels:
227235
for measure_completion_time in [0, 1]:
228236
for use_events in [0, 1]:
229-
benches.append(
230-
GraphApiSubmitGraph(
231-
self,
232-
runtime,
233-
in_order_queue,
234-
num_kernels,
235-
measure_completion_time,
236-
use_events,
237-
useHostTasks=0,
237+
for profiler_type in profiler_types:
238+
benches.append(
239+
GraphApiSubmitGraph(
240+
self,
241+
runtime,
242+
in_order_queue,
243+
num_kernels,
244+
measure_completion_time,
245+
profiler_type,
246+
use_events,
247+
useHostTasks=0,
248+
)
238249
)
239-
)
240250

241251
# Add other benchmarks
242252
benches += [
243-
QueueInOrderMemcpy(self, 0, "Device", "Device", 1024),
244-
QueueInOrderMemcpy(self, 0, "Host", "Device", 1024),
245-
QueueMemcpy(self, "Device", "Device", 1024),
246253
StreamMemory(self, "Triad", 10 * 1024, "Device"),
247-
ExecImmediateCopyQueue(self, 0, 1, "Device", "Device", 1024),
248-
ExecImmediateCopyQueue(self, 1, 1, "Device", "Host", 1024),
249254
VectorSum(self),
250255
GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 0, "Gromacs"),
251256
GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 1, "Gromacs"),
252257
GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 0, "Llama"),
253258
GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 1, "Llama"),
254259
]
260+
for profiler_type in profiler_types:
261+
benches.append(
262+
QueueInOrderMemcpy(self, 0, "Device", "Device", 1024, profiler_type)
263+
)
264+
benches.append(
265+
QueueInOrderMemcpy(self, 0, "Host", "Device", 1024, profiler_type)
266+
)
267+
benches.append(QueueMemcpy(self, "Device", "Device", 1024, profiler_type))
268+
benches.append(
269+
ExecImmediateCopyQueue(
270+
self, 0, 1, "Device", "Device", 1024, profiler_type
271+
)
272+
)
273+
benches.append(
274+
ExecImmediateCopyQueue(
275+
self, 1, 1, "Device", "Host", 1024, profiler_type
276+
)
277+
)
255278

256279
# Add UR-specific benchmarks
257280
benches += [
@@ -299,12 +322,15 @@ def parse_unit_type(compute_unit):
299322

300323

301324
class ComputeBenchmark(Benchmark):
302-
def __init__(self, bench, name, test, runtime: RUNTIMES = None):
325+
def __init__(
326+
self, bench, name, test, runtime: RUNTIMES = None, profiler_type: str = ""
327+
):
303328
super().__init__(bench.directory, bench)
304329
self.bench = bench
305330
self.bench_name = name
306331
self.test = test
307332
self.runtime = runtime
333+
self.profiler_type = profiler_type
308334
# Mandatory per-benchmark iteration counts.
309335
# Subclasses MUST set both `self.iterations_regular` and
310336
# `self.iterations_trace` (positive ints) in their __init__ before
@@ -465,6 +491,7 @@ def __init__(
465491
MeasureCompletion=0,
466492
UseEvents=0,
467493
KernelExecTime=1,
494+
profiler_type="",
468495
):
469496
self.ioq = ioq
470497
self.MeasureCompletion = MeasureCompletion
@@ -475,7 +502,11 @@ def __init__(
475502
self.iterations_regular = 100000
476503
self.iterations_trace = 10
477504
super().__init__(
478-
bench, f"api_overhead_benchmark_{runtime.value}", "SubmitKernel", runtime
505+
bench,
506+
f"api_overhead_benchmark_{runtime.value}",
507+
"SubmitKernel",
508+
runtime,
509+
profiler_type,
479510
)
480511

481512
def supported_runtimes(self) -> list[RUNTIMES]:
@@ -486,9 +517,14 @@ def enabled(self) -> bool:
486517
# The benchmark instance gets created just to make metadata for these old results
487518
if not super().enabled():
488519
return False
489-
if "bmg" in options.device_architecture and self.KernelExecTime == 20:
520+
521+
device_arch = getattr(options, "device_architecture", "")
522+
if "bmg" in device_arch and self.KernelExecTime == 20:
490523
# Disable this benchmark for BMG server, just create metadata
491524
return False
525+
if "bmg" not in device_arch and self.KernelExecTime == 200:
526+
# Disable KernelExecTime=200 for non-BMG systems, just create metadata
527+
return False
492528
return True
493529

494530
def get_tags(self):
@@ -545,7 +581,7 @@ def range(self) -> tuple[float, float]:
545581

546582
def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
547583
iters = self.get_iters(run_trace)
548-
return [
584+
bin_args = [
549585
f"--iterations={iters}",
550586
f"--Ioq={self.ioq}",
551587
f"--MeasureCompletion={self.MeasureCompletion}",
@@ -554,6 +590,9 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
554590
f"--KernelExecTime={self.KernelExecTime}",
555591
f"--UseEvents={self.UseEvents}",
556592
]
593+
if self.runtime == RUNTIMES.SYCL:
594+
bin_args.append(f"--profilerType={self.profiler_type}")
595+
return bin_args
557596

558597
def get_metadata(self) -> dict[str, BenchmarkMetadata]:
559598
metadata_dict = super().get_metadata()
@@ -573,7 +612,9 @@ def get_metadata(self) -> dict[str, BenchmarkMetadata]:
573612

574613

575614
class ExecImmediateCopyQueue(ComputeBenchmark):
576-
def __init__(self, bench, ioq, isCopyOnly, source, destination, size):
615+
def __init__(
616+
self, bench, ioq, isCopyOnly, source, destination, size, profiler_type
617+
):
577618
self.ioq = ioq
578619
self.isCopyOnly = isCopyOnly
579620
self.source = source
@@ -582,7 +623,12 @@ def __init__(self, bench, ioq, isCopyOnly, source, destination, size):
582623
# iterations per bin_args: --iterations=100000
583624
self.iterations_regular = 100000
584625
self.iterations_trace = 10
585-
super().__init__(bench, "api_overhead_benchmark_sycl", "ExecImmediateCopyQueue")
626+
super().__init__(
627+
bench,
628+
"api_overhead_benchmark_sycl",
629+
"ExecImmediateCopyQueue",
630+
profiler_type=profiler_type,
631+
)
586632

587633
def name(self):
588634
order = "in order" if self.ioq else "out of order"
@@ -614,19 +660,25 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
614660
f"--dst={self.destination}",
615661
f"--size={self.size}",
616662
"--withCopyOffload=0",
663+
f"--profilerType={self.profiler_type}",
617664
]
618665

619666

620667
class QueueInOrderMemcpy(ComputeBenchmark):
621-
def __init__(self, bench, isCopyOnly, source, destination, size):
668+
def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type):
622669
self.isCopyOnly = isCopyOnly
623670
self.source = source
624671
self.destination = destination
625672
self.size = size
626673
# iterations per bin_args: --iterations=10000
627674
self.iterations_regular = 10000
628675
self.iterations_trace = 10
629-
super().__init__(bench, "memory_benchmark_sycl", "QueueInOrderMemcpy")
676+
super().__init__(
677+
bench,
678+
"memory_benchmark_sycl",
679+
"QueueInOrderMemcpy",
680+
profiler_type=profiler_type,
681+
)
630682

631683
def name(self):
632684
return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}"
@@ -654,18 +706,21 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
654706
f"--size={self.size}",
655707
"--count=100",
656708
"--withCopyOffload=0",
709+
f"--profilerType={self.profiler_type}",
657710
]
658711

659712

660713
class QueueMemcpy(ComputeBenchmark):
661-
def __init__(self, bench, source, destination, size):
714+
def __init__(self, bench, source, destination, size, profiler_type):
662715
self.source = source
663716
self.destination = destination
664717
self.size = size
665718
# iterations per bin_args: --iterations=10000
666719
self.iterations_regular = 10000
667720
self.iterations_trace = 10
668-
super().__init__(bench, "memory_benchmark_sycl", "QueueMemcpy")
721+
super().__init__(
722+
bench, "memory_benchmark_sycl", "QueueMemcpy", profiler_type=profiler_type
723+
)
669724

670725
def name(self):
671726
return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}"
@@ -689,6 +744,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
689744
f"--sourcePlacement={self.source}",
690745
f"--destinationPlacement={self.destination}",
691746
f"--size={self.size}",
747+
f"--profilerType={self.profiler_type}",
692748
]
693749

694750

@@ -927,6 +983,7 @@ def __init__(
927983
inOrderQueue,
928984
numKernels,
929985
measureCompletionTime,
986+
profiler_type,
930987
useEvents,
931988
useHostTasks,
932989
):
@@ -945,7 +1002,11 @@ def __init__(
9451002
self.iterations_regular = 10000
9461003
self.iterations_trace = 10
9471004
super().__init__(
948-
bench, f"graph_api_benchmark_{runtime.value}", "SubmitGraph", runtime
1005+
bench,
1006+
f"graph_api_benchmark_{runtime.value}",
1007+
"SubmitGraph",
1008+
runtime,
1009+
profiler_type,
9491010
)
9501011

9511012
def explicit_group(self):
@@ -974,7 +1035,7 @@ def get_tags(self):
9741035

9751036
def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
9761037
iters = self.get_iters(run_trace)
977-
return [
1038+
bin_args = [
9781039
f"--iterations={iters}",
9791040
f"--NumKernels={self.numKernels}",
9801041
f"--MeasureCompletionTime={self.measureCompletionTime}",
@@ -985,17 +1046,24 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
9851046
"--UseExplicit=0",
9861047
f"--UseHostTasks={self.useHostTasks}",
9871048
]
1049+
if self.runtime == RUNTIMES.SYCL:
1050+
bin_args.append(f"--profilerType={self.profiler_type}")
1051+
return bin_args
9881052

9891053

9901054
class UllsEmptyKernel(ComputeBenchmark):
991-
def __init__(self, bench, runtime: RUNTIMES, wgc, wgs):
1055+
def __init__(self, bench, runtime: RUNTIMES, wgc, wgs, profiler_type):
9921056
self.wgc = wgc
9931057
self.wgs = wgs
9941058
# iterations per bin_args: --iterations=10000
9951059
self.iterations_regular = 10000
9961060
self.iterations_trace = 10
9971061
super().__init__(
998-
bench, f"ulls_benchmark_{runtime.value}", "EmptyKernel", runtime
1062+
bench,
1063+
f"ulls_benchmark_{runtime.value}",
1064+
"EmptyKernel",
1065+
runtime,
1066+
profiler_type,
9991067
)
10001068

10011069
def supported_runtimes(self) -> list[RUNTIMES]:
@@ -1020,11 +1088,14 @@ def get_tags(self):
10201088

10211089
def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
10221090
iters = self.get_iters(run_trace)
1023-
return [
1091+
bin_args = [
10241092
f"--iterations={iters}",
10251093
f"--wgs={self.wgs}",
10261094
f"--wgc={self.wgc}",
10271095
]
1096+
if self.runtime == RUNTIMES.SYCL:
1097+
bin_args.append(f"--profilerType={self.profiler_type}")
1098+
return bin_args
10281099

10291100

10301101
class UllsKernelSwitch(ComputeBenchmark):

0 commit comments

Comments
 (0)