
Commit f60f1b2

[Benchmarks] Bump Compute Benchmarks
- stabilize results with a small sleep between binary runs
- add combo profiler functionality to allow choosing between time-based and CPU-counter measurement
1 parent c7867ce commit f60f1b2
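
For context on the first bullet: the pacing change itself lives in the bumped compute-benchmarks sources rather than in this file, but the idea is a short pause between consecutive binary invocations so one run's residual device or driver activity does not skew the next. A minimal sketch of that idea, with a hypothetical helper name and pause duration:

import subprocess
import time

# Hypothetical illustration of "a small sleep between binary runs";
# the actual change is in the bumped compute-benchmarks revision.
def run_paced(cmd: list[str], runs: int, pause_s: float = 0.2) -> None:
    for _ in range(runs):
        subprocess.run(cmd, check=True)
        time.sleep(pause_s)  # let the device and driver settle before the next run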

File tree

1 file changed: +116 -50 lines


devops/scripts/benchmarks/benches/compute.py

Lines changed: 116 additions & 50 deletions
@@ -54,7 +54,7 @@ def git_url(self) -> str:
         return "https://github.com/intel/compute-benchmarks.git"
 
     def git_hash(self) -> str:
-        return "c9e135d4f26dd6badd83009f92f25d6285fc1e21"
+        return "4995560017559849a519e58978a0afdd55903e15"
 
     def setup(self) -> None:
         if options.sycl is None:
@@ -177,6 +177,9 @@ def benchmarks(self) -> list[Benchmark]:
         # See SubmitKernel.enabled()
         long_kernel_exec_time_ooo = [20, 200]
 
+        # The Combo Profiler is available only for selected sycl benchmarks
+        profiler_types = ["timer", "cpuCounter"]
+
         for runtime in list(RUNTIMES):
             # Add SubmitKernel benchmarks using loops
             for in_order_queue in [0, 1]:
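
The profiler_types list drives a simple Cartesian expansion: each benchmark that supports the combo profiler is instantiated once per profiler type, so every existing parameter combination now appears twice (timer and cpuCounter). A minimal sketch of the pattern, using a hypothetical stand-in class rather than the suite's real ones:

from dataclasses import dataclass
from itertools import product

# Hypothetical stand-in; the real classes (SubmitKernel, UllsEmptyKernel, ...)
# take many more parameters.
@dataclass
class Bench:
    in_order_queue: int
    profiler_type: str

profiler_types = ["timer", "cpuCounter"]  # time-based vs. CPU-counter measurement

benches = [Bench(ioq, prof) for ioq, prof in product([0, 1], profiler_types)]
print(len(benches))  # 2 queue modes x 2 profiler types -> 4 instances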
@@ -188,16 +191,18 @@ def benchmarks(self) -> list[Benchmark]:
                             else long_kernel_exec_time_ooo
                         )
                         for kernel_exec_time in [1, *long_kernel_exec_time]:
-                            benches.append(
-                                SubmitKernel(
-                                    self,
-                                    runtime,
-                                    in_order_queue,
-                                    measure_completion,
-                                    use_events,
-                                    kernel_exec_time,
+                            for profiler_type in profiler_types:
+                                benches.append(
+                                    SubmitKernel(
+                                        self,
+                                        runtime,
+                                        in_order_queue,
+                                        measure_completion,
+                                        use_events,
+                                        kernel_exec_time,
+                                        profiler_type,
+                                    )
                                 )
-                            )
 
             # Add SinKernelGraph benchmarks
             for with_graphs in [0, 1]:
@@ -207,51 +212,69 @@ def benchmarks(self) -> list[Benchmark]:
                     )
 
             # Add ULLS benchmarks
-            benches.append(UllsEmptyKernel(self, runtime, 1000, 256))
+            for profiler_type in profiler_types:
+                benches.append(UllsEmptyKernel(self, runtime, 1000, 256, profiler_type))
             benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1))
 
             # Add GraphApiSubmitGraph benchmarks
             for in_order_queue in [0, 1]:
-                benches.append(
-                    GraphApiSubmitGraph(
-                        self,
-                        runtime,
-                        in_order_queue,
-                        self.submit_graph_num_kernels[-1],
-                        0,
-                        useEvents=0,
-                        useHostTasks=1,
+                for profiler_type in profiler_types:
+                    benches.append(
+                        GraphApiSubmitGraph(
+                            self,
+                            runtime,
+                            in_order_queue,
+                            self.submit_graph_num_kernels[-1],
+                            0,
+                            profiler_type,
+                            useEvents=0,
+                            useHostTasks=1,
+                        )
                     )
-                )
                 for num_kernels in self.submit_graph_num_kernels:
                     for measure_completion_time in [0, 1]:
                         for use_events in [0, 1]:
-                            benches.append(
-                                GraphApiSubmitGraph(
-                                    self,
-                                    runtime,
-                                    in_order_queue,
-                                    num_kernels,
-                                    measure_completion_time,
-                                    use_events,
-                                    useHostTasks=0,
+                            for profiler_type in profiler_types:
+                                benches.append(
+                                    GraphApiSubmitGraph(
+                                        self,
+                                        runtime,
+                                        in_order_queue,
+                                        num_kernels,
+                                        measure_completion_time,
+                                        profiler_type,
+                                        use_events,
+                                        useHostTasks=0,
+                                    )
                                 )
-                            )
 
         # Add other benchmarks
         benches += [
-            QueueInOrderMemcpy(self, 0, "Device", "Device", 1024),
-            QueueInOrderMemcpy(self, 0, "Host", "Device", 1024),
-            QueueMemcpy(self, "Device", "Device", 1024),
             StreamMemory(self, "Triad", 10 * 1024, "Device"),
-            ExecImmediateCopyQueue(self, 0, 1, "Device", "Device", 1024),
-            ExecImmediateCopyQueue(self, 1, 1, "Device", "Host", 1024),
             VectorSum(self),
             GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 0, "Gromacs"),
             GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 1, "Gromacs"),
             GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 0, "Llama"),
             GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 1, "Llama"),
         ]
+        for profiler_type in profiler_types:
+            benches.append(
+                QueueInOrderMemcpy(self, 0, "Device", "Device", 1024, profiler_type)
+            )
+            benches.append(
+                QueueInOrderMemcpy(self, 0, "Host", "Device", 1024, profiler_type)
+            )
+            benches.append(QueueMemcpy(self, "Device", "Device", 1024, profiler_type))
+            benches.append(
+                ExecImmediateCopyQueue(
+                    self, 0, 1, "Device", "Device", 1024, profiler_type
+                )
+            )
+            benches.append(
+                ExecImmediateCopyQueue(
+                    self, 1, 1, "Device", "Host", 1024, profiler_type
+                )
+            )
 
         # Add UR-specific benchmarks
         benches += [
@@ -299,12 +322,15 @@ def parse_unit_type(compute_unit):
 
 
 class ComputeBenchmark(Benchmark):
-    def __init__(self, bench, name, test, runtime: RUNTIMES = None):
+    def __init__(
+        self, bench, name, test, runtime: RUNTIMES = None, profiler_type: str = ""
+    ):
         super().__init__(bench.directory, bench)
         self.bench = bench
         self.bench_name = name
         self.test = test
         self.runtime = runtime
+        self.profiler_type = profiler_type
         # Mandatory per-benchmark iteration counts.
         # Subclasses MUST set both `self.iterations_regular` and
         # `self.iterations_trace` (positive ints) in their __init__ before
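
The empty-string default keeps the base class backward compatible: subclasses that never pass profiler_type construct exactly as before, and only benchmarks that opt in forward a value. A minimal sketch of that default-argument pattern, using a simplified class rather than the real ComputeBenchmark:

# Simplified sketch of the backward-compatible default, not the real class.
class ComputeBenchmarkSketch:
    def __init__(self, name: str, profiler_type: str = ""):
        self.name = name
        self.profiler_type = profiler_type  # "" means "no profiler selected"

legacy = ComputeBenchmarkSketch("QueueMemcpy")          # still valid, profiler_type == ""
combo = ComputeBenchmarkSketch("QueueMemcpy", "timer")  # opts in to the combo profiler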
@@ -465,6 +491,7 @@ def __init__(
         MeasureCompletion=0,
         UseEvents=0,
         KernelExecTime=1,
+        profiler_type="",
     ):
         self.ioq = ioq
         self.MeasureCompletion = MeasureCompletion
@@ -475,7 +502,11 @@ def __init__(
         self.iterations_regular = 100000
         self.iterations_trace = 10
         super().__init__(
-            bench, f"api_overhead_benchmark_{runtime.value}", "SubmitKernel", runtime
+            bench,
+            f"api_overhead_benchmark_{runtime.value}",
+            "SubmitKernel",
+            runtime,
+            profiler_type,
         )
 
     def supported_runtimes(self) -> list[RUNTIMES]:
@@ -545,7 +576,7 @@ def range(self) -> tuple[float, float]:
 
     def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
         iters = self.get_iters(run_trace)
-        return [
+        bin_args = [
             f"--iterations={iters}",
             f"--Ioq={self.ioq}",
             f"--MeasureCompletion={self.MeasureCompletion}",
@@ -554,6 +585,9 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--KernelExecTime={self.KernelExecTime}",
             f"--UseEvents={self.UseEvents}",
         ]
+        if self.runtime == RUNTIMES.SYCL:
+            bin_args.append(f"--profilerType={self.profiler_type}")
+        return bin_args
 
     def get_metadata(self) -> dict[str, BenchmarkMetadata]:
         metadata_dict = super().get_metadata()
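
bin_args now collects the arguments in a local list and appends --profilerType only when the runtime is SYCL, matching the comment above that the Combo Profiler is available only for selected sycl benchmarks; the sycl-only classes below (ExecImmediateCopyQueue, QueueInOrderMemcpy, QueueMemcpy) append the flag unconditionally. A runnable sketch of the conditional-flag pattern, with a simplified stand-in for the suite's RUNTIMES enum:

from enum import Enum

class RUNTIMES(Enum):  # simplified stand-in for the suite's enum
    SYCL = "sycl"
    UR = "ur"

def bin_args(runtime: RUNTIMES, profiler_type: str, iters: int) -> list[str]:
    args = [f"--iterations={iters}"]
    if runtime == RUNTIMES.SYCL:
        args.append(f"--profilerType={profiler_type}")  # SYCL-only switch
    return args

print(bin_args(RUNTIMES.SYCL, "cpuCounter", 10))  # includes --profilerType
print(bin_args(RUNTIMES.UR, "cpuCounter", 10))    # omits it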
@@ -573,7 +607,9 @@ def get_metadata(self) -> dict[str, BenchmarkMetadata]:
 
 
 class ExecImmediateCopyQueue(ComputeBenchmark):
-    def __init__(self, bench, ioq, isCopyOnly, source, destination, size):
+    def __init__(
+        self, bench, ioq, isCopyOnly, source, destination, size, profiler_type
+    ):
         self.ioq = ioq
         self.isCopyOnly = isCopyOnly
         self.source = source
@@ -582,7 +618,12 @@ def __init__(self, bench, ioq, isCopyOnly, source, destination, size):
         # iterations per bin_args: --iterations=100000
         self.iterations_regular = 100000
         self.iterations_trace = 10
-        super().__init__(bench, "api_overhead_benchmark_sycl", "ExecImmediateCopyQueue")
+        super().__init__(
+            bench,
+            "api_overhead_benchmark_sycl",
+            "ExecImmediateCopyQueue",
+            profiler_type=profiler_type,
+        )
 
     def name(self):
         order = "in order" if self.ioq else "out of order"
@@ -614,19 +655,25 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--dst={self.destination}",
             f"--size={self.size}",
             "--withCopyOffload=0",
+            f"--profilerType={self.profiler_type}",
         ]
 
 
 class QueueInOrderMemcpy(ComputeBenchmark):
-    def __init__(self, bench, isCopyOnly, source, destination, size):
+    def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type):
         self.isCopyOnly = isCopyOnly
         self.source = source
         self.destination = destination
         self.size = size
         # iterations per bin_args: --iterations=10000
         self.iterations_regular = 10000
         self.iterations_trace = 10
-        super().__init__(bench, "memory_benchmark_sycl", "QueueInOrderMemcpy")
+        super().__init__(
+            bench,
+            "memory_benchmark_sycl",
+            "QueueInOrderMemcpy",
+            profiler_type=profiler_type,
+        )
 
     def name(self):
         return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}"
@@ -654,18 +701,21 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--size={self.size}",
             "--count=100",
             "--withCopyOffload=0",
+            f"--profilerType={self.profiler_type}",
         ]
 
 
 class QueueMemcpy(ComputeBenchmark):
-    def __init__(self, bench, source, destination, size):
+    def __init__(self, bench, source, destination, size, profiler_type):
         self.source = source
         self.destination = destination
         self.size = size
         # iterations per bin_args: --iterations=10000
         self.iterations_regular = 10000
         self.iterations_trace = 10
-        super().__init__(bench, "memory_benchmark_sycl", "QueueMemcpy")
+        super().__init__(
+            bench, "memory_benchmark_sycl", "QueueMemcpy", profiler_type=profiler_type
+        )
 
     def name(self):
         return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}"
@@ -689,6 +739,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--sourcePlacement={self.source}",
             f"--destinationPlacement={self.destination}",
             f"--size={self.size}",
+            f"--profilerType={self.profiler_type}",
         ]
 
 
@@ -927,6 +978,7 @@ def __init__(
         inOrderQueue,
         numKernels,
         measureCompletionTime,
+        profiler_type,
         useEvents,
         useHostTasks,
     ):
@@ -945,7 +997,11 @@ def __init__(
         self.iterations_regular = 10000
         self.iterations_trace = 10
         super().__init__(
-            bench, f"graph_api_benchmark_{runtime.value}", "SubmitGraph", runtime
+            bench,
+            f"graph_api_benchmark_{runtime.value}",
+            "SubmitGraph",
+            runtime,
+            profiler_type,
         )
 
     def explicit_group(self):
@@ -974,7 +1030,7 @@ def get_tags(self):
 
     def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
         iters = self.get_iters(run_trace)
-        return [
+        bin_args = [
             f"--iterations={iters}",
             f"--NumKernels={self.numKernels}",
             f"--MeasureCompletionTime={self.measureCompletionTime}",
@@ -985,17 +1041,24 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             "--UseExplicit=0",
             f"--UseHostTasks={self.useHostTasks}",
         ]
+        if self.runtime == RUNTIMES.SYCL:
+            bin_args.append(f"--profilerType={self.profiler_type}")
+        return bin_args
 
 
 class UllsEmptyKernel(ComputeBenchmark):
-    def __init__(self, bench, runtime: RUNTIMES, wgc, wgs):
+    def __init__(self, bench, runtime: RUNTIMES, wgc, wgs, profiler_type):
         self.wgc = wgc
         self.wgs = wgs
         # iterations per bin_args: --iterations=10000
         self.iterations_regular = 10000
         self.iterations_trace = 10
         super().__init__(
-            bench, f"ulls_benchmark_{runtime.value}", "EmptyKernel", runtime
+            bench,
+            f"ulls_benchmark_{runtime.value}",
+            "EmptyKernel",
+            runtime,
+            profiler_type,
         )
 
     def supported_runtimes(self) -> list[RUNTIMES]:
@@ -1020,11 +1083,14 @@ def get_tags(self):
 
     def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
         iters = self.get_iters(run_trace)
-        return [
+        bin_args = [
             f"--iterations={iters}",
             f"--wgs={self.wgs}",
             f"--wgc={self.wgc}",
         ]
+        if self.runtime == RUNTIMES.SYCL:
+            bin_args.append(f"--profilerType={self.profiler_type}")
+        return bin_args
 
 
 class UllsKernelSwitch(ComputeBenchmark):
