Skip to content

Commit 93a515d

Browse files
authored
[Benchmarks] Use combo profiler in UR SubmitKernel scenarios (#20295)
Use the Combo Profiler in all supported Compute Benchmark scenarios. ~~Introduce the `--profiler-type` parameter for picking one profiler each run. This effectively disables running the same scenarios two times: with time measurement and with CPU instructions retired counter.~~ ~~Leaving by default `CPU counter` scenarios enabled in CI. This will result in fewer scenarios being run, as we won't run time-measuring scenarios where the CPU counter is available. In the dashboard, only historic results will be drawn, no new runs in such cases.~~ The `CPU count` group and name suffixes can now be safely determined beforehand, without having to parse run logs. Also, a Compute Benchmarks version bump is made for these changes.
1 parent 1f89d58 commit 93a515d

File tree

1 file changed

+51
-84
lines changed

1 file changed

+51
-84
lines changed

devops/scripts/benchmarks/benches/compute.py

Lines changed: 51 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,8 @@ def git_url(self) -> str:
6161
return "https://github.com/intel/compute-benchmarks.git"
6262

6363
def git_hash(self) -> str:
64-
# Sep 25, 2025
65-
return "7ba2e629404e34c635a46f28550a0952717d120f"
64+
# Oct 9, 2025
65+
return "32805b4b6f8dafb4a97f21c4c85bb2f6963f8dbb"
6666

6767
def setup(self) -> None:
6868
if options.sycl is None:
@@ -152,7 +152,7 @@ def benchmarks(self) -> list[Benchmark]:
152152
kernel_exec_time,
153153
)
154154
)
155-
if runtime == RUNTIMES.SYCL:
155+
if runtime in (RUNTIMES.SYCL, RUNTIMES.SYCL_PREVIEW, RUNTIMES.UR):
156156
# Create CPU count variant
157157
benches.append(
158158
SubmitKernel(
@@ -203,6 +203,11 @@ def benchmarks(self) -> list[Benchmark]:
203203
measure_completion_time,
204204
use_events,
205205
) in submit_graph_params:
206+
# Non-sycl runtimes have to be run with emulated graphs,
207+
# see: https://github.com/intel/compute-benchmarks/commit/d81d5d602739482b9070c872a28c0b5ebb41de70
208+
emulate_graphs = (
209+
0 if runtime in (RUNTIMES.SYCL, RUNTIMES.SYCL_PREVIEW) else 1
210+
)
206211
benches.append(
207212
GraphApiSubmitGraph(
208213
self,
@@ -211,6 +216,7 @@ def benchmarks(self) -> list[Benchmark]:
211216
num_kernels,
212217
measure_completion_time,
213218
use_events,
219+
emulate_graphs,
214220
useHostTasks=0,
215221
)
216222
)
@@ -224,6 +230,7 @@ def benchmarks(self) -> list[Benchmark]:
224230
num_kernels,
225231
measure_completion_time,
226232
use_events,
233+
emulate_graphs,
227234
useHostTasks=0,
228235
profiler_type=PROFILERS.CPU_COUNTER,
229236
)
@@ -294,14 +301,6 @@ def benchmarks(self) -> list[Benchmark]:
294301
return benches
295302

296303

297-
def parse_unit_type(compute_unit):
298-
if "[count]" in compute_unit:
299-
return "instr"
300-
elif "[us]" in compute_unit:
301-
return "μs"
302-
return compute_unit.replace("[", "").replace("]", "")
303-
304-
305304
class ComputeBenchmark(Benchmark):
306305
def __init__(
307306
self,
@@ -330,6 +329,17 @@ def benchmark_bin(self) -> Path:
330329
"""Returns the path to the benchmark binary"""
331330
return self.bench.project.build_dir / "bin" / self.bench_name
332331

332+
def cpu_count_str(self, separator: str = "") -> str:
333+
# Note: SYCL CI currently relies on this "CPU count" value.
334+
# Please update /devops/scripts/benchmarks/compare.py if this value
335+
# is changed. See compare.py usage (w.r.t. --regression-filter) in
336+
# /devops/actions/run-tests/benchmarks/action.yml.
337+
return (
338+
f"{separator} CPU count"
339+
if self.profiler_type == PROFILERS.CPU_COUNTER
340+
else ""
341+
)
342+
333343
def get_iters(self, run_trace: TracingType):
334344
"""Returns the number of iterations to run for the given tracing type."""
335345
return (
@@ -412,27 +422,23 @@ def run(
412422
)
413423
parsed_results = self.parse_output(result)
414424
ret = []
415-
for label, median, stddev, unit in parsed_results:
416-
extra_label = " CPU count" if parse_unit_type(unit) == "instr" else ""
417-
# Note: SYCL CI currently relies on this "CPU count" value.
418-
# Please update /devops/scripts/benchmarks/compare.py if this value
419-
# is changed. See compare.py usage (w.r.t. --regression-filter) in
420-
# /devops/actions/run-tests/benchmarks/action.yml.
425+
for median, stddev in parsed_results:
426+
unit = "instr" if self.profiler_type == PROFILERS.CPU_COUNTER else "μs"
421427
ret.append(
422428
Result(
423-
label=self.name() + extra_label,
429+
label=self.name(),
424430
value=median,
425431
stddev=stddev,
426432
command=command,
427433
env=env_vars,
428-
unit=parse_unit_type(unit),
434+
unit=unit,
429435
git_url=self.bench.git_url(),
430436
git_hash=self.bench.git_hash(),
431437
)
432438
)
433439
return ret
434440

435-
def parse_output(self, output):
441+
def parse_output(self, output: str) -> list[tuple[float, float]]:
436442
csv_file = io.StringIO(output)
437443
reader = csv.reader(csv_file)
438444
next(reader, None)
@@ -442,16 +448,14 @@ def parse_output(self, output):
442448
if data_row is None:
443449
break
444450
try:
445-
label = data_row[0]
446451
mean = float(data_row[1])
447452
median = float(data_row[2])
448453
# compute benchmarks report stddev as %
449454
stddev = mean * (float(data_row[3].strip("%")) / 100.0)
450455
if not math.isfinite(stddev):
451456
stddev = 0.0 # Default to 0.0 if stddev is invalid
452457

453-
unit = data_row[7]
454-
results.append((label, median, stddev, unit))
458+
results.append((median, stddev))
455459
except (ValueError, IndexError) as e:
456460
raise ValueError(f"Error parsing output: {e}")
457461
if len(results) == 0:
@@ -532,7 +536,7 @@ def name(self):
532536
f" KernelExecTime={self.KernelExecTime}" if self.KernelExecTime != 1 else ""
533537
)
534538

535-
return f"api_overhead_benchmark_{self.runtime.value} SubmitKernel {order}{completion_str}{events_str}{kernel_exec_time_str}"
539+
return f"api_overhead_benchmark_{self.runtime.value} SubmitKernel {order}{completion_str}{events_str}{kernel_exec_time_str}{self.cpu_count_str()}"
536540

537541
def display_name(self) -> str:
538542
order = "in order" if self.ioq else "out of order"
@@ -544,7 +548,7 @@ def display_name(self) -> str:
544548
if self.KernelExecTime != 1:
545549
info.append(f"KernelExecTime={self.KernelExecTime}")
546550
additional_info = f" {' '.join(info)}" if info else ""
547-
return f"{self.runtime.value.upper()} SubmitKernel {order}{additional_info}, NumKernels {self.NumKernels}"
551+
return f"{self.runtime.value.upper()} SubmitKernel {order}{additional_info}, NumKernels {self.NumKernels}{self.cpu_count_str(separator=',')}"
548552

549553
def explicit_group(self):
550554
order = "in order" if self.ioq else "out of order"
@@ -553,7 +557,7 @@ def explicit_group(self):
553557

554558
kernel_exec_time_str = f" long kernel" if self.KernelExecTime != 1 else ""
555559

556-
return f"SubmitKernel {order}{completion_str}{events_str}{kernel_exec_time_str}"
560+
return f"SubmitKernel {order}{completion_str}{events_str}{kernel_exec_time_str}{self.cpu_count_str(separator=',')}"
557561

558562
def description(self) -> str:
559563
order = "in-order" if self.ioq else "out-of-order"
@@ -571,34 +575,16 @@ def range(self) -> tuple[float, float]:
571575

572576
def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
573577
iters = self.get_iters(run_trace)
574-
bin_args = [
578+
return [
575579
f"--iterations={iters}",
576580
f"--Ioq={self.ioq}",
577581
f"--MeasureCompletion={self.MeasureCompletion}",
578582
"--Profiling=0",
579583
f"--NumKernels={self.NumKernels}",
580584
f"--KernelExecTime={self.KernelExecTime}",
581585
f"--UseEvents={self.UseEvents}",
586+
f"--profilerType={self.profiler_type.value}",
582587
]
583-
if self.runtime == RUNTIMES.SYCL:
584-
bin_args.append(f"--profilerType={self.profiler_type.value}")
585-
return bin_args
586-
587-
def get_metadata(self) -> dict[str, BenchmarkMetadata]:
588-
metadata_dict = super().get_metadata()
589-
590-
# Create CPU count variant with modified display name and explicit_group
591-
cpu_count_name = self.name() + " CPU count"
592-
cpu_count_metadata = copy.deepcopy(metadata_dict[self.name()])
593-
cpu_count_display_name = self.display_name() + ", CPU count"
594-
cpu_count_explicit_group = (
595-
self.explicit_group() + ", CPU count" if self.explicit_group() else ""
596-
)
597-
cpu_count_metadata.display_name = cpu_count_display_name
598-
cpu_count_metadata.explicit_group = cpu_count_explicit_group
599-
metadata_dict[cpu_count_name] = cpu_count_metadata
600-
601-
return metadata_dict
602588

603589

604590
class ExecImmediateCopyQueue(ComputeBenchmark):
@@ -622,11 +608,11 @@ def __init__(
622608

623609
def name(self):
624610
order = "in order" if self.ioq else "out of order"
625-
return f"api_overhead_benchmark_sycl ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}"
611+
return f"api_overhead_benchmark_sycl ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str()}"
626612

627613
def display_name(self) -> str:
628614
order = "in order" if self.ioq else "out of order"
629-
return f"SYCL ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}"
615+
return f"SYCL ExecImmediateCopyQueue {order} from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str(separator=',')}"
630616

631617
def description(self) -> str:
632618
order = "in-order" if self.ioq else "out-of-order"
@@ -671,10 +657,10 @@ def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type):
671657
)
672658

673659
def name(self):
674-
return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}"
660+
return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str()}"
675661

676662
def display_name(self) -> str:
677-
return f"SYCL QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}"
663+
return f"SYCL QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str(separator=',')}"
678664

679665
def description(self) -> str:
680666
operation = "copy-only" if self.isCopyOnly else "copy and command submission"
@@ -713,10 +699,10 @@ def __init__(self, bench, source, destination, size, profiler_type):
713699
)
714700

715701
def name(self):
716-
return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}"
702+
return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str()}"
717703

718704
def display_name(self) -> str:
719-
return f"SYCL QueueMemcpy from {self.source} to {self.destination}, size {self.size}"
705+
return f"SYCL QueueMemcpy from {self.source} to {self.destination}, size {self.size}{self.cpu_count_str(separator=',')}"
720706

721707
def description(self) -> str:
722708
return (
@@ -974,6 +960,7 @@ def __init__(
974960
numKernels,
975961
measureCompletionTime,
976962
useEvents,
963+
emulate_graphs,
977964
useHostTasks,
978965
profiler_type=PROFILERS.TIMER,
979966
):
@@ -982,6 +969,7 @@ def __init__(
982969
self.measureCompletionTime = measureCompletionTime
983970
self.useEvents = useEvents
984971
self.useHostTasks = useHostTasks
972+
self.emulateGraphs = emulate_graphs
985973
self.ioq_str = "in order" if self.inOrderQueue else "out of order"
986974
self.measure_str = (
987975
" with measure completion" if self.measureCompletionTime else ""
@@ -1003,7 +991,7 @@ def supported_runtimes(self) -> list[RUNTIMES]:
1003991
return super().supported_runtimes() + [RUNTIMES.SYCL_PREVIEW]
1004992

1005993
def explicit_group(self):
1006-
return f"SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels"
994+
return f"SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels{self.cpu_count_str(separator=',')}"
1007995

1008996
def description(self) -> str:
1009997
return (
@@ -1012,10 +1000,10 @@ def description(self) -> str:
10121000
)
10131001

10141002
def name(self):
1015-
return f"graph_api_benchmark_{self.runtime.value} SubmitGraph{self.use_events_str}{self.host_tasks_str} numKernels:{self.numKernels} ioq {self.inOrderQueue} measureCompletion {self.measureCompletionTime}"
1003+
return f"graph_api_benchmark_{self.runtime.value} SubmitGraph{self.use_events_str}{self.host_tasks_str} numKernels:{self.numKernels} ioq {self.inOrderQueue} measureCompletion {self.measureCompletionTime}{self.cpu_count_str()}"
10161004

10171005
def display_name(self) -> str:
1018-
return f"{self.runtime.value.upper()} SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels"
1006+
return f"{self.runtime.value.upper()} SubmitGraph {self.ioq_str}{self.measure_str}{self.use_events_str}{self.host_tasks_str}, {self.numKernels} kernels{self.cpu_count_str(separator=',')}"
10191007

10201008
def get_tags(self):
10211009
return [
@@ -1028,7 +1016,7 @@ def get_tags(self):
10281016

10291017
def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
10301018
iters = self.get_iters(run_trace)
1031-
bin_args = [
1019+
return [
10321020
f"--iterations={iters}",
10331021
f"--NumKernels={self.numKernels}",
10341022
f"--MeasureCompletionTime={self.measureCompletionTime}",
@@ -1038,26 +1026,9 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
10381026
f"--UseEvents={self.useEvents}",
10391027
"--UseExplicit=0",
10401028
f"--UseHostTasks={self.useHostTasks}",
1029+
f"--profilerType={self.profiler_type.value}",
1030+
f"--EmulateGraphs={self.emulateGraphs}",
10411031
]
1042-
if self.runtime == RUNTIMES.SYCL:
1043-
bin_args.append(f"--profilerType={self.profiler_type.value}")
1044-
return bin_args
1045-
1046-
def get_metadata(self) -> dict[str, BenchmarkMetadata]:
1047-
metadata_dict = super().get_metadata()
1048-
1049-
# Create CPU count variant with modified display name and explicit_group
1050-
cpu_count_name = self.name() + " CPU count"
1051-
cpu_count_metadata = copy.deepcopy(metadata_dict[self.name()])
1052-
cpu_count_display_name = self.display_name() + ", CPU count"
1053-
cpu_count_explicit_group = (
1054-
self.explicit_group() + ", CPU count" if self.explicit_group() else ""
1055-
)
1056-
cpu_count_metadata.display_name = cpu_count_display_name
1057-
cpu_count_metadata.explicit_group = cpu_count_explicit_group
1058-
metadata_dict[cpu_count_name] = cpu_count_metadata
1059-
1060-
return metadata_dict
10611032

10621033

10631034
class UllsEmptyKernel(ComputeBenchmark):
@@ -1081,32 +1052,28 @@ def supported_runtimes(self) -> list[RUNTIMES]:
10811052
return [RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]
10821053

10831054
def explicit_group(self):
1084-
return f"EmptyKernel, wgc: {self.wgc}, wgs: {self.wgs}"
1055+
return f"EmptyKernel, wgc: {self.wgc}, wgs: {self.wgs}{self.cpu_count_str(separator=',')}"
10851056

10861057
def description(self) -> str:
10871058
return ""
10881059

10891060
def name(self):
1090-
return f"ulls_benchmark_{self.runtime.value} EmptyKernel wgc:{self.wgc}, wgs:{self.wgs}"
1061+
return f"ulls_benchmark_{self.runtime.value} EmptyKernel wgc:{self.wgc}, wgs:{self.wgs}{self.cpu_count_str()}"
10911062

10921063
def display_name(self) -> str:
1093-
return (
1094-
f"{self.runtime.value.upper()} EmptyKernel, wgc {self.wgc}, wgs {self.wgs}"
1095-
)
1064+
return f"{self.runtime.value.upper()} EmptyKernel, wgc {self.wgc}, wgs {self.wgs}{self.cpu_count_str(separator=',')}"
10961065

10971066
def get_tags(self):
10981067
return [runtime_to_tag_name(self.runtime), "micro", "latency", "submit"]
10991068

11001069
def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
11011070
iters = self.get_iters(run_trace)
1102-
bin_args = [
1071+
return [
11031072
f"--iterations={iters}",
11041073
f"--wgs={self.wgs}",
11051074
f"--wgc={self.wgc}",
1075+
f"--profilerType={self.profiler_type.value}",
11061076
]
1107-
if self.runtime == RUNTIMES.SYCL:
1108-
bin_args.append(f"--profilerType={self.profiler_type.value}")
1109-
return bin_args
11101077

11111078

11121079
class UllsKernelSwitch(ComputeBenchmark):

0 commit comments

Comments
 (0)