
Commit 2c6c0c6

[Benchmarks] Bump Compute Benchmarks

- stabilize results with a small sleep between binary runs
- add combo profiler functionality to allow choosing between time and CPU counter measurement

1 parent 8a9b62a commit 2c6c0c6
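The combo profiler added here selects between wall-clock time and CPU counter measurement. As a minimal sketch of what that choice amounts to, assuming only what appears in the diff below (the two profiler_type values and the --profilerType flag); the helper name is hypothetical, not from the commit:

    # Hypothetical helper; the values "timer" and "cpuCounter" and the
    # --profilerType flag are taken from the diff below.
    PROFILER_TYPES = ["timer", "cpuCounter"]

    def profiler_flag(profiler_type: str) -> str:
        if profiler_type not in PROFILER_TYPES:
            raise ValueError(f"unknown profiler type: {profiler_type}")
        return f"--profilerType={profiler_type}"

    print(profiler_flag("cpuCounter"))  # --profilerType=cpuCounter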

File tree

1 file changed: +75 -49 lines changed

devops/scripts/benchmarks/benches/compute.py

Lines changed: 75 additions & 49 deletions
@@ -53,7 +53,7 @@ def git_url(self) -> str:
         return "https://github.com/intel/compute-benchmarks.git"
 
     def git_hash(self) -> str:
-        return "c9e135d4f26dd6badd83009f92f25d6285fc1e21"
+        return "4995560017559849a519e58978a0afdd55903e15"
 
     def setup(self) -> None:
         if options.sycl is None:
@@ -173,6 +173,9 @@ def benchmarks(self) -> list[Benchmark]:
         # See SubmitKernel.enabled()
         long_kernel_exec_time_ooo = [20, 200]
 
+        # The Combo Profiler is available only for selected sycl benchmarks
+        profiler_types = ["timer", "cpuCounter"]
+
         for runtime in list(RUNTIMES):
             # Add SubmitKernel benchmarks using loops
             for in_order_queue in [0, 1]:
@@ -184,14 +187,16 @@ def benchmarks(self) -> list[Benchmark]:
                     else long_kernel_exec_time_ooo
                 )
                 for kernel_exec_time in [1, *long_kernel_exec_time]:
-                    benches.append(
-                        SubmitKernel(
-                            self,
-                            runtime,
-                            in_order_queue,
-                            measure_completion,
-                            use_events,
-                            kernel_exec_time,
+                    for profiler_type in profiler_types:
+                        benches.append(
+                            SubmitKernel(
+                                self,
+                                runtime,
+                                in_order_queue,
+                                measure_completion,
+                                use_events,
+                                kernel_exec_time,
+                                profiler_type,
                             )
                         )
 
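The extra `for profiler_type in profiler_types` level doubles every SubmitKernel variant. An equivalent way to see the expansion, sketched with itertools.product (loop bounds abbreviated; the in-order kernel-exec-time values come from the surrounding code):

    # Sketch only: the real code above uses nested loops. Counts are
    # illustrative; kernel_exec_time values [1, 20, 200] match the
    # in-order case shown earlier in this method.
    from itertools import product

    profiler_types = ["timer", "cpuCounter"]
    variants = list(product([0, 1],            # in_order_queue
                            [1, 20, 200],      # kernel_exec_time
                            profiler_types))   # new inner loop
    print(len(variants))  # 12: each prior combination now appears twice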
@@ -203,51 +208,57 @@ def benchmarks(self) -> list[Benchmark]:
                 )
 
             # Add ULLS benchmarks
-            benches.append(UllsEmptyKernel(self, runtime, 1000, 256))
+            for profiler_type in profiler_types:
+                benches.append(UllsEmptyKernel(self, runtime, 1000, 256, profiler_type))
             benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1))
 
             # Add GraphApiSubmitGraph benchmarks
             for in_order_queue in [0, 1]:
-                benches.append(
-                    GraphApiSubmitGraph(
-                        self,
-                        runtime,
-                        in_order_queue,
-                        self.submit_graph_num_kernels[-1],
-                        0,
-                        useEvents=0,
-                        useHostTasks=1,
+                for profiler_type in profiler_types:
+                    benches.append(
+                        GraphApiSubmitGraph(
+                            self,
+                            runtime,
+                            in_order_queue,
+                            self.submit_graph_num_kernels[-1],
+                            0,
+                            profiler_type,
+                            useEvents=0,
+                            useHostTasks=1,
+                        )
                     )
-                )
                 for num_kernels in self.submit_graph_num_kernels:
                     for measure_completion_time in [0, 1]:
                         for use_events in [0, 1]:
-                            benches.append(
-                                GraphApiSubmitGraph(
-                                    self,
-                                    runtime,
-                                    in_order_queue,
-                                    num_kernels,
-                                    measure_completion_time,
-                                    use_events,
-                                    useHostTasks=0,
+                            for profiler_type in profiler_types:
+                                benches.append(
+                                    GraphApiSubmitGraph(
+                                        self,
+                                        runtime,
+                                        in_order_queue,
+                                        num_kernels,
+                                        measure_completion_time,
+                                        profiler_type,
+                                        use_events,
+                                        useHostTasks=0,
+                                    )
                                 )
-                            )
 
         # Add other benchmarks
         benches += [
-            QueueInOrderMemcpy(self, 0, "Device", "Device", 1024),
-            QueueInOrderMemcpy(self, 0, "Host", "Device", 1024),
-            QueueMemcpy(self, "Device", "Device", 1024),
             StreamMemory(self, "Triad", 10 * 1024, "Device"),
-            ExecImmediateCopyQueue(self, 0, 1, "Device", "Device", 1024),
-            ExecImmediateCopyQueue(self, 1, 1, "Device", "Host", 1024),
             VectorSum(self),
             GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 0, "Gromacs"),
             GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 1, "Gromacs"),
             GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 0, "Llama"),
             GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 1, "Llama"),
         ]
+        for profiler_type in profiler_types:
+            benches.append(QueueInOrderMemcpy(self, 0, "Device", "Device", 1024, profiler_type))
+            benches.append(QueueInOrderMemcpy(self, 0, "Host", "Device", 1024, profiler_type))
+            benches.append(QueueMemcpy(self, "Device", "Device", 1024, profiler_type))
+            benches.append(ExecImmediateCopyQueue(self, 0, 1, "Device", "Device", 1024, profiler_type))
+            benches.append(ExecImmediateCopyQueue(self, 1, 1, "Device", "Host", 1024, profiler_type))
 
         # Add UR-specific benchmarks
         benches += [
@@ -295,12 +306,13 @@ def parse_unit_type(compute_unit):
 
 
 class ComputeBenchmark(Benchmark):
-    def __init__(self, bench, name, test, runtime: RUNTIMES = None):
+    def __init__(self, bench, name, test, runtime: RUNTIMES = None, profiler_type: str = ""):
         super().__init__(bench.directory, bench)
         self.bench = bench
         self.bench_name = name
         self.test = test
         self.runtime = runtime
+        self.profiler_type = profiler_type
 
     def supported_runtimes(self) -> list[RUNTIMES]:
         """Base runtimes supported by this benchmark, can be overridden."""
@@ -428,14 +440,15 @@ def __init__(
         MeasureCompletion=0,
         UseEvents=0,
         KernelExecTime=1,
+        profiler_type="",
     ):
         self.ioq = ioq
         self.MeasureCompletion = MeasureCompletion
         self.UseEvents = UseEvents
         self.KernelExecTime = KernelExecTime
         self.NumKernels = 10
         super().__init__(
-            bench, f"api_overhead_benchmark_{runtime.value}", "SubmitKernel", runtime
+            bench, f"api_overhead_benchmark_{runtime.value}", "SubmitKernel", runtime, profiler_type
         )
 
     def supported_runtimes(self) -> list[RUNTIMES]:
@@ -504,7 +517,7 @@ def range(self) -> tuple[float, float]:
         return (0.0, None)
 
     def bin_args(self) -> list[str]:
-        return [
+        bin_args = [
             f"--Ioq={self.ioq}",
             f"--MeasureCompletion={self.MeasureCompletion}",
             "--iterations=100000",
@@ -513,6 +526,9 @@ def bin_args(self) -> list[str]:
             f"--KernelExecTime={self.KernelExecTime}",
             f"--UseEvents={self.UseEvents}",
         ]
+        if self.runtime == RUNTIMES.SYCL:
+            bin_args.append(f"--profilerType={self.profiler_type}")
+        return bin_args
 
     def get_metadata(self) -> dict[str, BenchmarkMetadata]:
         metadata_dict = super().get_metadata()
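SubmitKernel runs under several runtimes, so the flag is appended only when the SYCL binary, the one that understands it, is in use. A standalone sketch of that guard, with a stand-in enum (the real RUNTIMES enum is defined elsewhere in compute.py):

    # Stand-in enum for illustration; the guard mirrors the hunk above.
    from enum import Enum

    class Runtime(Enum):
        SYCL = "sycl"
        LEVEL_ZERO = "l0"
        UR = "ur"

    def finalize_args(args: list[str], runtime: Runtime, profiler_type: str) -> list[str]:
        if runtime is Runtime.SYCL:
            args.append(f"--profilerType={profiler_type}")
        return args

    print(finalize_args(["--iterations=100000"], Runtime.LEVEL_ZERO, "timer"))
    # ['--iterations=100000'] -- the flag is omitted for non-SYCL runtimes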
@@ -532,13 +548,13 @@ def get_metadata(self) -> dict[str, BenchmarkMetadata]:
 
 
 class ExecImmediateCopyQueue(ComputeBenchmark):
-    def __init__(self, bench, ioq, isCopyOnly, source, destination, size):
+    def __init__(self, bench, ioq, isCopyOnly, source, destination, size, profiler_type):
         self.ioq = ioq
         self.isCopyOnly = isCopyOnly
         self.source = source
         self.destination = destination
         self.size = size
-        super().__init__(bench, "api_overhead_benchmark_sycl", "ExecImmediateCopyQueue")
+        super().__init__(bench, "api_overhead_benchmark_sycl", "ExecImmediateCopyQueue", profiler_type=profiler_type)
 
     def name(self):
         order = "in order" if self.ioq else "out of order"
@@ -569,16 +585,17 @@ def bin_args(self) -> list[str]:
             f"--dst={self.destination}",
             f"--size={self.size}",
             "--withCopyOffload=0",
+            f"--profilerType={self.profiler_type}",
         ]
 
 
 class QueueInOrderMemcpy(ComputeBenchmark):
-    def __init__(self, bench, isCopyOnly, source, destination, size):
+    def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type):
         self.isCopyOnly = isCopyOnly
         self.source = source
         self.destination = destination
         self.size = size
-        super().__init__(bench, "memory_benchmark_sycl", "QueueInOrderMemcpy")
+        super().__init__(bench, "memory_benchmark_sycl", "QueueInOrderMemcpy", profiler_type=profiler_type)
 
     def name(self):
         return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}"
@@ -605,15 +622,16 @@ def bin_args(self) -> list[str]:
             f"--size={self.size}",
             "--count=100",
             "--withCopyOffload=0",
+            f"--profilerType={self.profiler_type}"
         ]
 
 
 class QueueMemcpy(ComputeBenchmark):
-    def __init__(self, bench, source, destination, size):
+    def __init__(self, bench, source, destination, size, profiler_type):
         self.source = source
         self.destination = destination
         self.size = size
-        super().__init__(bench, "memory_benchmark_sycl", "QueueMemcpy")
+        super().__init__(bench, "memory_benchmark_sycl", "QueueMemcpy", profiler_type=profiler_type)
 
     def name(self):
         return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}"
@@ -636,6 +654,7 @@ def bin_args(self) -> list[str]:
             f"--sourcePlacement={self.source}",
             f"--destinationPlacement={self.destination}",
             f"--size={self.size}",
+            f"--profilerType={self.profiler_type}"
         ]
 
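The SYCL-only benchmarks (ExecImmediateCopyQueue, QueueInOrderMemcpy, QueueMemcpy) pass the flag unconditionally. If you wanted to reproduce one of these runs by hand, the invocation would look roughly like this; the binary name and the --size/--profilerType flags appear in the diff, while the path and the --test selector are assumptions:

    # Hypothetical manual run; verify the binary location and its accepted
    # flags before relying on this.
    import subprocess

    cmd = [
        "./memory_benchmark_sycl",      # binary name from the diff; path assumed
        "--test=QueueMemcpy",           # test selector; assumed syntax
        "--size=1024",
        "--profilerType=cpuCounter",    # the new combo-profiler switch
    ]
    print(" ".join(cmd))
    # subprocess.run(cmd, check=True)  # uncomment to actually execute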
@@ -858,6 +877,7 @@ def __init__(
         inOrderQueue,
         numKernels,
         measureCompletionTime,
+        profiler_type,
         useEvents,
         useHostTasks,
     ):
@@ -873,7 +893,7 @@ def __init__(
         self.use_events_str = f" with events" if self.useEvents else ""
         self.host_tasks_str = f" use host tasks" if self.useHostTasks else ""
         super().__init__(
-            bench, f"graph_api_benchmark_{runtime.value}", "SubmitGraph", runtime
+            bench, f"graph_api_benchmark_{runtime.value}", "SubmitGraph", runtime, profiler_type
         )
 
     def explicit_group(self):
@@ -901,7 +921,7 @@ def get_tags(self):
         ]
 
     def bin_args(self) -> list[str]:
-        return [
+        bin_args = [
             "--iterations=10000",
             f"--NumKernels={self.numKernels}",
             f"--MeasureCompletionTime={self.measureCompletionTime}",
@@ -912,14 +932,17 @@ def bin_args(self) -> list[str]:
             "--UseExplicit=0",
             f"--UseHostTasks={self.useHostTasks}",
         ]
+        if self.runtime == RUNTIMES.SYCL:
+            bin_args.append(f"--profilerType={self.profiler_type}")
+        return bin_args
 
 
 class UllsEmptyKernel(ComputeBenchmark):
-    def __init__(self, bench, runtime: RUNTIMES, wgc, wgs):
+    def __init__(self, bench, runtime: RUNTIMES, wgc, wgs, profiler_type):
         self.wgc = wgc
         self.wgs = wgs
         super().__init__(
-            bench, f"ulls_benchmark_{runtime.value}", "EmptyKernel", runtime
+            bench, f"ulls_benchmark_{runtime.value}", "EmptyKernel", runtime, profiler_type
         )
 
     def supported_runtimes(self) -> list[RUNTIMES]:
@@ -943,11 +966,14 @@ def get_tags(self):
         return [runtime_to_tag_name(self.runtime), "micro", "latency", "submit"]
 
     def bin_args(self) -> list[str]:
-        return [
+        bin_args = [
             "--iterations=10000",
             f"--wgs={self.wgs}",
             f"--wgc={self.wgc}",
         ]
+        if self.runtime == RUNTIMES.SYCL:
+            bin_args.append(f"--profilerType={self.profiler_type}")
+        return bin_args
 
 
 class UllsKernelSwitch(ComputeBenchmark):
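The commit message also mentions stabilizing results with a small sleep between binary runs; that hunk is not part of the excerpt above. The general pattern, with an assumed pause duration (the real call site and value live elsewhere in the benchmark runner):

    # Generic sketch, not the commit's code: pause briefly after each
    # binary run so the system settles before the next measurement.
    import subprocess
    import time

    def run_with_settle(cmd: list[str], pause_s: float = 1.0) -> None:
        subprocess.run(cmd, check=True)
        time.sleep(pause_s)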
