Skip to content

Commit 0eb7c73

Browse files
authored
[Benchmarks] Bump Compute Benchmarks (#19857)
- Stabilize results with a small sleep between binary runs.
- Add combo-profiler functionality, which allows choosing between time and CPU counter measurement.
1 parent fe8a292 commit 0eb7c73

File tree

1 file changed

+122
-51
lines changed

1 file changed

+122
-51
lines changed

devops/scripts/benchmarks/benches/compute.py

Lines changed: 122 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ def git_url(self) -> str:
5454
return "https://github.com/intel/compute-benchmarks.git"
5555

5656
def git_hash(self) -> str:
57-
return "c9e135d4f26dd6badd83009f92f25d6285fc1e21"
57+
return "4995560017559849a519e58978a0afdd55903e15"
5858

5959
def setup(self) -> None:
6060
if options.sycl is None:
@@ -177,6 +177,9 @@ def benchmarks(self) -> list[Benchmark]:
177177
# See SubmitKernel.enabled()
178178
long_kernel_exec_time_ooo = [20, 200]
179179

180+
# The Combo Profiler is available only for selected sycl benchmarks
181+
profiler_types = ["timer", "cpuCounter"]
182+
180183
for runtime in list(RUNTIMES):
181184
# Add SubmitKernel benchmarks using loops
182185
for in_order_queue in [0, 1]:
@@ -188,16 +191,18 @@ def benchmarks(self) -> list[Benchmark]:
188191
else long_kernel_exec_time_ooo
189192
)
190193
for kernel_exec_time in [1, *long_kernel_exec_time]:
191-
benches.append(
192-
SubmitKernel(
193-
self,
194-
runtime,
195-
in_order_queue,
196-
measure_completion,
197-
use_events,
198-
kernel_exec_time,
194+
for profiler_type in profiler_types:
195+
benches.append(
196+
SubmitKernel(
197+
self,
198+
runtime,
199+
in_order_queue,
200+
measure_completion,
201+
use_events,
202+
kernel_exec_time,
203+
profiler_type,
204+
)
199205
)
200-
)
201206

202207
# Add SinKernelGraph benchmarks
203208
for with_graphs in [0, 1]:
@@ -207,51 +212,69 @@ def benchmarks(self) -> list[Benchmark]:
207212
)
208213

209214
# Add ULLS benchmarks
210-
benches.append(UllsEmptyKernel(self, runtime, 1000, 256))
215+
for profiler_type in profiler_types:
216+
benches.append(UllsEmptyKernel(self, runtime, 1000, 256, profiler_type))
211217
benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1))
212218

213219
# Add GraphApiSubmitGraph benchmarks
214220
for in_order_queue in [0, 1]:
215-
benches.append(
216-
GraphApiSubmitGraph(
217-
self,
218-
runtime,
219-
in_order_queue,
220-
self.submit_graph_num_kernels[-1],
221-
0,
222-
useEvents=0,
223-
useHostTasks=1,
221+
for profiler_type in profiler_types:
222+
benches.append(
223+
GraphApiSubmitGraph(
224+
self,
225+
runtime,
226+
in_order_queue,
227+
self.submit_graph_num_kernels[-1],
228+
0,
229+
profiler_type,
230+
useEvents=0,
231+
useHostTasks=1,
232+
)
224233
)
225-
)
226234
for num_kernels in self.submit_graph_num_kernels:
227235
for measure_completion_time in [0, 1]:
228236
for use_events in [0, 1]:
229-
benches.append(
230-
GraphApiSubmitGraph(
231-
self,
232-
runtime,
233-
in_order_queue,
234-
num_kernels,
235-
measure_completion_time,
236-
use_events,
237-
useHostTasks=0,
237+
for profiler_type in profiler_types:
238+
benches.append(
239+
GraphApiSubmitGraph(
240+
self,
241+
runtime,
242+
in_order_queue,
243+
num_kernels,
244+
measure_completion_time,
245+
profiler_type,
246+
use_events,
247+
useHostTasks=0,
248+
)
238249
)
239-
)
240250

241251
# Add other benchmarks
242252
benches += [
243-
QueueInOrderMemcpy(self, 0, "Device", "Device", 1024),
244-
QueueInOrderMemcpy(self, 0, "Host", "Device", 1024),
245-
QueueMemcpy(self, "Device", "Device", 1024),
246253
StreamMemory(self, "Triad", 10 * 1024, "Device"),
247-
ExecImmediateCopyQueue(self, 0, 1, "Device", "Device", 1024),
248-
ExecImmediateCopyQueue(self, 1, 1, "Device", "Host", 1024),
249254
VectorSum(self),
250255
GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 0, "Gromacs"),
251256
GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 1, "Gromacs"),
252257
GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 0, "Llama"),
253258
GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 1, "Llama"),
254259
]
260+
for profiler_type in profiler_types:
261+
benches.append(
262+
QueueInOrderMemcpy(self, 0, "Device", "Device", 1024, profiler_type)
263+
)
264+
benches.append(
265+
QueueInOrderMemcpy(self, 0, "Host", "Device", 1024, profiler_type)
266+
)
267+
benches.append(QueueMemcpy(self, "Device", "Device", 1024, profiler_type))
268+
benches.append(
269+
ExecImmediateCopyQueue(
270+
self, 0, 1, "Device", "Device", 1024, profiler_type
271+
)
272+
)
273+
benches.append(
274+
ExecImmediateCopyQueue(
275+
self, 1, 1, "Device", "Host", 1024, profiler_type
276+
)
277+
)
255278

256279
# Add UR-specific benchmarks
257280
benches += [
@@ -299,12 +322,15 @@ def parse_unit_type(compute_unit):
299322

300323

301324
class ComputeBenchmark(Benchmark):
302-
def __init__(self, bench, name, test, runtime: RUNTIMES = None):
325+
def __init__(
326+
self, bench, name, test, runtime: RUNTIMES = None, profiler_type: str = ""
327+
):
303328
super().__init__(bench.directory, bench)
304329
self.bench = bench
305330
self.bench_name = name
306331
self.test = test
307332
self.runtime = runtime
333+
self.profiler_type = profiler_type
308334
# Mandatory per-benchmark iteration counts.
309335
# Subclasses MUST set both `self.iterations_regular` and
310336
# `self.iterations_trace` (positive ints) in their __init__ before
@@ -465,6 +491,7 @@ def __init__(
465491
MeasureCompletion=0,
466492
UseEvents=0,
467493
KernelExecTime=1,
494+
profiler_type="",
468495
):
469496
self.ioq = ioq
470497
self.MeasureCompletion = MeasureCompletion
@@ -475,7 +502,11 @@ def __init__(
475502
self.iterations_regular = 100000
476503
self.iterations_trace = 10
477504
super().__init__(
478-
bench, f"api_overhead_benchmark_{runtime.value}", "SubmitKernel", runtime
505+
bench,
506+
f"api_overhead_benchmark_{runtime.value}",
507+
"SubmitKernel",
508+
runtime,
509+
profiler_type,
479510
)
480511

481512
def supported_runtimes(self) -> list[RUNTIMES]:
@@ -486,9 +517,14 @@ def enabled(self) -> bool:
486517
# The benchmark instance gets created just to make metadata for these old results
487518
if not super().enabled():
488519
return False
489-
if "bmg" in options.device_architecture and self.KernelExecTime == 20:
520+
521+
device_arch = getattr(options, "device_architecture", "")
522+
if "bmg" in device_arch and self.KernelExecTime == 20:
490523
# Disable this benchmark for BMG server, just create metadata
491524
return False
525+
if "bmg" not in device_arch and self.KernelExecTime == 200:
526+
# Disable KernelExecTime=200 for non-BMG systems, just create metadata
527+
return False
492528
return True
493529

494530
def get_tags(self):
@@ -545,7 +581,7 @@ def range(self) -> tuple[float, float]:
545581

546582
def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
547583
iters = self.get_iters(run_trace)
548-
return [
584+
bin_args = [
549585
f"--iterations={iters}",
550586
f"--Ioq={self.ioq}",
551587
f"--MeasureCompletion={self.MeasureCompletion}",
@@ -554,6 +590,9 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
554590
f"--KernelExecTime={self.KernelExecTime}",
555591
f"--UseEvents={self.UseEvents}",
556592
]
593+
if self.runtime == RUNTIMES.SYCL:
594+
bin_args.append(f"--profilerType={self.profiler_type}")
595+
return bin_args
557596

558597
def get_metadata(self) -> dict[str, BenchmarkMetadata]:
559598
metadata_dict = super().get_metadata()
@@ -573,7 +612,9 @@ def get_metadata(self) -> dict[str, BenchmarkMetadata]:
573612

574613

575614
class ExecImmediateCopyQueue(ComputeBenchmark):
576-
def __init__(self, bench, ioq, isCopyOnly, source, destination, size):
615+
def __init__(
616+
self, bench, ioq, isCopyOnly, source, destination, size, profiler_type
617+
):
577618
self.ioq = ioq
578619
self.isCopyOnly = isCopyOnly
579620
self.source = source
@@ -582,7 +623,12 @@ def __init__(self, bench, ioq, isCopyOnly, source, destination, size):
582623
# iterations per bin_args: --iterations=100000
583624
self.iterations_regular = 100000
584625
self.iterations_trace = 10
585-
super().__init__(bench, "api_overhead_benchmark_sycl", "ExecImmediateCopyQueue")
626+
super().__init__(
627+
bench,
628+
"api_overhead_benchmark_sycl",
629+
"ExecImmediateCopyQueue",
630+
profiler_type=profiler_type,
631+
)
586632

587633
def name(self):
588634
order = "in order" if self.ioq else "out of order"
@@ -614,19 +660,25 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
614660
f"--dst={self.destination}",
615661
f"--size={self.size}",
616662
"--withCopyOffload=0",
663+
f"--profilerType={self.profiler_type}",
617664
]
618665

619666

620667
class QueueInOrderMemcpy(ComputeBenchmark):
621-
def __init__(self, bench, isCopyOnly, source, destination, size):
668+
def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type):
622669
self.isCopyOnly = isCopyOnly
623670
self.source = source
624671
self.destination = destination
625672
self.size = size
626673
# iterations per bin_args: --iterations=10000
627674
self.iterations_regular = 10000
628675
self.iterations_trace = 10
629-
super().__init__(bench, "memory_benchmark_sycl", "QueueInOrderMemcpy")
676+
super().__init__(
677+
bench,
678+
"memory_benchmark_sycl",
679+
"QueueInOrderMemcpy",
680+
profiler_type=profiler_type,
681+
)
630682

631683
def name(self):
632684
return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}"
@@ -654,18 +706,21 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
654706
f"--size={self.size}",
655707
"--count=100",
656708
"--withCopyOffload=0",
709+
f"--profilerType={self.profiler_type}",
657710
]
658711

659712

660713
class QueueMemcpy(ComputeBenchmark):
661-
def __init__(self, bench, source, destination, size):
714+
def __init__(self, bench, source, destination, size, profiler_type):
662715
self.source = source
663716
self.destination = destination
664717
self.size = size
665718
# iterations per bin_args: --iterations=10000
666719
self.iterations_regular = 10000
667720
self.iterations_trace = 10
668-
super().__init__(bench, "memory_benchmark_sycl", "QueueMemcpy")
721+
super().__init__(
722+
bench, "memory_benchmark_sycl", "QueueMemcpy", profiler_type=profiler_type
723+
)
669724

670725
def name(self):
671726
return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}"
@@ -689,6 +744,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
689744
f"--sourcePlacement={self.source}",
690745
f"--destinationPlacement={self.destination}",
691746
f"--size={self.size}",
747+
f"--profilerType={self.profiler_type}",
692748
]
693749

694750

@@ -927,6 +983,7 @@ def __init__(
927983
inOrderQueue,
928984
numKernels,
929985
measureCompletionTime,
986+
profiler_type,
930987
useEvents,
931988
useHostTasks,
932989
):
@@ -945,7 +1002,11 @@ def __init__(
9451002
self.iterations_regular = 10000
9461003
self.iterations_trace = 10
9471004
super().__init__(
948-
bench, f"graph_api_benchmark_{runtime.value}", "SubmitGraph", runtime
1005+
bench,
1006+
f"graph_api_benchmark_{runtime.value}",
1007+
"SubmitGraph",
1008+
runtime,
1009+
profiler_type,
9491010
)
9501011

9511012
def explicit_group(self):
@@ -974,7 +1035,7 @@ def get_tags(self):
9741035

9751036
def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
9761037
iters = self.get_iters(run_trace)
977-
return [
1038+
bin_args = [
9781039
f"--iterations={iters}",
9791040
f"--NumKernels={self.numKernels}",
9801041
f"--MeasureCompletionTime={self.measureCompletionTime}",
@@ -985,17 +1046,24 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
9851046
"--UseExplicit=0",
9861047
f"--UseHostTasks={self.useHostTasks}",
9871048
]
1049+
if self.runtime == RUNTIMES.SYCL:
1050+
bin_args.append(f"--profilerType={self.profiler_type}")
1051+
return bin_args
9881052

9891053

9901054
class UllsEmptyKernel(ComputeBenchmark):
991-
def __init__(self, bench, runtime: RUNTIMES, wgc, wgs):
1055+
def __init__(self, bench, runtime: RUNTIMES, wgc, wgs, profiler_type):
9921056
self.wgc = wgc
9931057
self.wgs = wgs
9941058
# iterations per bin_args: --iterations=10000
9951059
self.iterations_regular = 10000
9961060
self.iterations_trace = 10
9971061
super().__init__(
998-
bench, f"ulls_benchmark_{runtime.value}", "EmptyKernel", runtime
1062+
bench,
1063+
f"ulls_benchmark_{runtime.value}",
1064+
"EmptyKernel",
1065+
runtime,
1066+
profiler_type,
9991067
)
10001068

10011069
def supported_runtimes(self) -> list[RUNTIMES]:
@@ -1020,11 +1088,14 @@ def get_tags(self):
10201088

10211089
def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
10221090
iters = self.get_iters(run_trace)
1023-
return [
1091+
bin_args = [
10241092
f"--iterations={iters}",
10251093
f"--wgs={self.wgs}",
10261094
f"--wgc={self.wgc}",
10271095
]
1096+
if self.runtime == RUNTIMES.SYCL:
1097+
bin_args.append(f"--profilerType={self.profiler_type}")
1098+
return bin_args
10281099

10291100

10301101
class UllsKernelSwitch(ComputeBenchmark):

0 commit comments

Comments
 (0)