
Commit fae04f4

cleanup compute bench, fix readme, use newer sycl-bench

1 parent 1041db6

File tree: 5 files changed, +99 / -129 lines

- devops/scripts/benchmarks/README.md
- devops/scripts/benchmarks/benches/base.py
- devops/scripts/benchmarks/benches/compute.py
- devops/scripts/benchmarks/benches/syclbench.py
- devops/scripts/benchmarks/main.py

devops/scripts/benchmarks/README.md

Lines changed: 4 additions & 4 deletions

@@ -6,6 +6,8 @@ Scripts for running performance tests on SYCL and Unified Runtime.
 
 - [Velocity Bench](https://github.com/oneapi-src/Velocity-Bench)
 - [Compute Benchmarks](https://github.com/intel/compute-benchmarks/)
+- [LlamaCpp Benchmarks](https://github.com/ggerganov/llama.cpp)
+- [SYCL-Bench](https://github.com/unisa-hpc/sycl-bench)
 
 ## Running
 
@@ -27,8 +29,6 @@ You can also include additional benchmark parameters, such as environment variab
 
 Once all the required information is entered, click the "Run workflow" button to initiate a new workflow run. This will execute the benchmarks and then post the results as a comment on the specified Pull Request.
 
-By default, all benchmark runs are compared against `baseline`, which is a well-established set of the latest data.
-
 You must be a member of the `oneapi-src` organization to access these features.
 
 ## Comparing results
@@ -37,8 +37,8 @@ By default, the benchmark results are not stored. To store them, use the option
 
 You can compare benchmark results using the `--compare` option. The comparison will be presented in a markdown output file (see below). If you want to calculate the relative performance of the new results against previously saved data, use `--compare <previously_saved_data>` (e.g. `--compare baseline`). When comparing only stored data without generating new results, use `--dry-run --compare <name1> --compare <name2> --relative-perf <name1>`, where `name1` indicates the baseline for the relative performance calculation and `--dry-run` prevents the script from running benchmarks. Listing more than two `--compare` options results in displaying only execution time, without statistical analysis.
 
-Baseline, as well as baseline-v2 (for the level-zero adapter v2) is updated automatically during a nightly job. The results
-are stored [here](https://oneapi-src.github.io/unified-runtime/benchmark_results.html).
+Baseline_L0, as well as Baseline_L0v2 (for the level-zero adapter v2) is updated automatically during a nightly job. The results
+are stored [here](https://oneapi-src.github.io/unified-runtime/performance/).
 
 ## Output formats
 You can display the results as an HTML file by using `--output-html` and as a markdown file by using `--output-markdown`. Due to character limits for posting PR comments, the final content of the markdown file might be reduced. In order to obtain the full markdown output, use `--output-markdown full`.
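As a hedged illustration of the comparison workflow described in this hunk: the flags below come from the README text, while the result names and the assumption that no other arguments are required are placeholders, not part of this commit.

```python
# Illustrative only: invokes the benchmark script with the comparison flags documented
# above. "baseline" matches the README; "my_run" is a hypothetical saved result name,
# and any additional arguments the script may require are omitted here.
import subprocess

subprocess.run(
    [
        "python", "devops/scripts/benchmarks/main.py",
        "--dry-run",                    # compare stored data only, do not run benchmarks
        "--compare", "baseline",
        "--compare", "my_run",
        "--relative-perf", "baseline",  # baseline for the relative-performance calculation
        "--output-markdown", "full",    # full markdown output, not the PR-comment-sized one
    ],
    check=True,
)
```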

devops/scripts/benchmarks/benches/base.py

Lines changed: 5 additions & 8 deletions

@@ -75,12 +75,6 @@ def download(
         self.data_path = self.create_data_path(name, skip_data_dir)
         return download(self.data_path, url, file, untar, unzip, checksum)
 
-    def name(self):
-        raise NotImplementedError()
-
-    def description(self):
-        return "No description provided."
-
     def lower_is_better(self):
         return True
 
@@ -99,8 +93,11 @@ def stddev_threshold(self):
     def get_suite_name(self) -> str:
         return self.suite.name()
 
-    def result_names(self) -> list[str]:
-        return [self.name()]
+    def name(self):
+        raise NotImplementedError()
+
+    def description(self):
+        return "No description provided."
 
     def notes(self) -> str:
         return None

devops/scripts/benchmarks/benches/compute.py

Lines changed: 87 additions & 114 deletions

@@ -13,6 +13,20 @@
 from enum import Enum
 
 
+class RUNTIMES(Enum):
+    SYCL = "sycl"
+    LEVEL_ZERO = "l0"
+    UR = "ur"
+
+
+def runtime_to_name(runtime: RUNTIMES) -> str:
+    return {
+        RUNTIMES.SYCL: "SYCL",
+        RUNTIMES.LEVEL_ZERO: "Level Zero",
+        RUNTIMES.UR: "Unified Runtime",
+    }[runtime]
+
+
 class ComputeBench(Suite):
     def __init__(self, directory):
         self.directory = directory
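The `RUNTIMES` enum now lives at module scope (it previously sat further down in the file, next to `GraphApiSinKernelGraph`), so both the suite and the individual benchmark classes can use it. A minimal, self-contained sketch of how the enum and `runtime_to_name` behave; the definitions are restated from the hunk above rather than imported from the repo:

```python
from enum import Enum

# Mirrors the definitions added in the hunk above; restated here only so the
# snippet runs on its own.
class RUNTIMES(Enum):
    SYCL = "sycl"
    LEVEL_ZERO = "l0"
    UR = "ur"

def runtime_to_name(runtime: RUNTIMES) -> str:
    return {
        RUNTIMES.SYCL: "SYCL",
        RUNTIMES.LEVEL_ZERO: "Level Zero",
        RUNTIMES.UR: "Unified Runtime",
    }[runtime]

# The enum value feeds binary names (e.g. "api_overhead_benchmark_l0"),
# while runtime_to_name() feeds the human-readable descriptions.
assert RUNTIMES.LEVEL_ZERO.value == "l0"
assert runtime_to_name(RUNTIMES.LEVEL_ZERO) == "Level Zero"
```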
@@ -70,64 +84,78 @@ def additionalMetadata(self) -> dict[str, BenchmarkMetadata]:
             ),
         }
 
+    def enabled_runtimes(self, supported_runtimes=None):
+        # all runtimes in the RUNTIMES enum
+        runtimes = supported_runtimes or list(RUNTIMES)
+
+        # Filter out UR if not available
+        if options.ur is None:
+            runtimes = [r for r in runtimes if r != RUNTIMES.UR]
+
+        return runtimes
+
     def benchmarks(self) -> list[Benchmark]:
         if options.sycl is None:
             return []
 
         if options.ur_adapter == "cuda":
             return []
 
-        benches = [
-            SubmitKernelL0(self, 0),
-            SubmitKernelL0(self, 1),
-            SubmitKernelSYCL(self, 0),
-            SubmitKernelSYCL(self, 1),
+        benches = []
+
+        # Add SubmitKernel benchmarks using loops
+        for runtime in self.enabled_runtimes():
+            for in_order_queue in [0, 1]:
+                for measure_completion in [0, 1]:
+                    benches.append(
+                        SubmitKernel(self, runtime, in_order_queue, measure_completion)
+                    )
+
+        # Add SinKernelGraph benchmarks
+        for runtime in self.enabled_runtimes():
+            for with_graphs in [0, 1]:
+                for num_kernels in [5, 100]:
+                    benches.append(
+                        GraphApiSinKernelGraph(self, runtime, with_graphs, num_kernels)
+                    )
+
+        # Add ULLS benchmarks
+        for runtime in self.enabled_runtimes([RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]):
+            benches.append(UllsEmptyKernel(self, runtime, 1000, 256))
+            benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1))
+
+        # Add GraphApiSubmitGraph benchmarks
+        for runtime in self.enabled_runtimes([RUNTIMES.SYCL]):
+            for in_order_queue in [0, 1]:
+                for num_kernels in [4, 10, 32]:
+                    for measure_completion_time in [0, 1]:
+                        benches.append(
+                            GraphApiSubmitGraph(
+                                self,
+                                runtime,
+                                in_order_queue,
+                                num_kernels,
+                                measure_completion_time,
+                            )
+                        )
+
+        # Add other benchmarks
+        benches += [
             QueueInOrderMemcpy(self, 0, "Device", "Device", 1024),
             QueueInOrderMemcpy(self, 0, "Host", "Device", 1024),
             QueueMemcpy(self, "Device", "Device", 1024),
             StreamMemory(self, "Triad", 10 * 1024, "Device"),
             ExecImmediateCopyQueue(self, 0, 1, "Device", "Device", 1024),
             ExecImmediateCopyQueue(self, 1, 1, "Device", "Host", 1024),
             VectorSum(self),
-            GraphApiSinKernelGraph(self, RUNTIMES.SYCL, 0, 5),
-            GraphApiSinKernelGraph(self, RUNTIMES.SYCL, 1, 5),
-            GraphApiSinKernelGraph(self, RUNTIMES.SYCL, 0, 100),
-            GraphApiSinKernelGraph(self, RUNTIMES.SYCL, 1, 100),
-            GraphApiSinKernelGraph(self, RUNTIMES.LEVEL_ZERO, 0, 5),
-            GraphApiSinKernelGraph(self, RUNTIMES.LEVEL_ZERO, 1, 5),
-            GraphApiSinKernelGraph(self, RUNTIMES.LEVEL_ZERO, 0, 100),
-            GraphApiSinKernelGraph(self, RUNTIMES.LEVEL_ZERO, 1, 100),
-            UllsEmptyKernel(self, RUNTIMES.SYCL, 1000, 256),
-            UllsEmptyKernel(self, RUNTIMES.LEVEL_ZERO, 1000, 256),
-            UllsKernelSwitch(self, RUNTIMES.SYCL, 8, 200, 0, 0, 1, 1),
-            UllsKernelSwitch(self, RUNTIMES.LEVEL_ZERO, 8, 200, 0, 0, 1, 1),
         ]
 
-        for in_order_queue in [0, 1]:
-            for num_kernels in [4, 32]:
-                for measure_completion_time in [0, 1]:
-                    benches.append(
-                        GraphApiSubmitGraph(
-                            self,
-                            RUNTIMES.SYCL,
-                            in_order_queue,
-                            num_kernels,
-                            measure_completion_time,
-                        )
-                    )
-
+        # Add UR-specific benchmarks
         if options.ur is not None:
             benches += [
-                SubmitKernelUR(self, 0, 0),
-                SubmitKernelUR(self, 1, 0),
-                SubmitKernelUR(self, 1, 1),
                 MemcpyExecute(self, 400, 1, 102400, 10, 1, 1, 1),
                 MemcpyExecute(self, 400, 1, 102400, 10, 0, 1, 1),
                 MemcpyExecute(self, 4096, 4, 1024, 10, 0, 1, 0),
-                GraphApiSinKernelGraph(self, RUNTIMES.UR, 0, 5),
-                GraphApiSinKernelGraph(self, RUNTIMES.UR, 1, 5),
-                GraphApiSinKernelGraph(self, RUNTIMES.UR, 0, 100),
-                GraphApiSinKernelGraph(self, RUNTIMES.UR, 1, 100),
             ]
 
         return benches
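The hand-maintained lists of SubmitKernel, SinKernelGraph, ULLS, and SubmitGraph variants are replaced by nested loops over the configuration space. As an illustration only (not code from this commit), the same enumeration can be expressed as a Cartesian product; with all three runtimes enabled the SubmitKernel loop yields 3 x 2 x 2 = 12 configurations, and 2 x 2 x 2 = 8 when the UR adapter is filtered out:

```python
from itertools import product

# Illustrative stand-in for the RUNTIMES members returned by enabled_runtimes().
runtimes = ["sycl", "l0", "ur"]

# (runtime, in_order_queue, measure_completion) tuples, as in the loop above.
submit_kernel_configs = list(product(runtimes, [0, 1], [0, 1]))
print(len(submit_kernel_configs))  # 12

# Without the UR adapter available, enabled_runtimes() drops "ur":
print(len(list(product([r for r in runtimes if r != "ur"], [0, 1], [0, 1]))))  # 8
```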
@@ -228,98 +256,49 @@ def teardown(self):
         return
 
 
-class SubmitKernelSYCL(ComputeBenchmark):
-    def __init__(self, bench, ioq):
+class SubmitKernel(ComputeBenchmark):
+    def __init__(self, bench, runtime: RUNTIMES, ioq, measure_completion=0):
         self.ioq = ioq
-        super().__init__(bench, "api_overhead_benchmark_sycl", "SubmitKernel")
+        self.runtime = runtime
+        self.measure_completion = measure_completion
+        super().__init__(
+            bench, f"api_overhead_benchmark_{runtime.value}", "SubmitKernel"
+        )
 
     def name(self):
         order = "in order" if self.ioq else "out of order"
-        return f"api_overhead_benchmark_sycl SubmitKernel {order}"
+        completion_str = " with measure completion" if self.measure_completion else ""
+        return f"api_overhead_benchmark_{self.runtime.value} SubmitKernel {order}{completion_str}"
 
     def explicit_group(self):
-        return "SubmitKernel"
-
-    def bin_args(self) -> list[str]:
-        return [
-            f"--Ioq={self.ioq}",
-            "--DiscardEvents=0",
-            "--MeasureCompletion=0",
-            "--iterations=100000",
-            "--Profiling=0",
-            "--NumKernels=10",
-            "--KernelExecTime=1",
-        ]
-
-    def description(self) -> str:
-        order = "in-order" if self.ioq else "out-of-order"
         return (
-            f"Measures CPU time overhead of submitting {order} kernels through SYCL API."
-            "Uses 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time."
-        )
-
-
-class SubmitKernelUR(ComputeBenchmark):
-    def __init__(self, bench, ioq, measureCompletion):
-        self.ioq = ioq
-        self.measureCompletion = measureCompletion
-        super().__init__(bench, "api_overhead_benchmark_ur", "SubmitKernel")
-
-    def name(self):
-        order = "in order" if self.ioq else "out of order"
-        return f"api_overhead_benchmark_ur SubmitKernel {order}" + (
-            " with measure completion" if self.measureCompletion else ""
+            "SubmitKernel"
+            if self.measure_completion == 0
+            else "SubmitKernel With Completion"
         )
 
-    def explicit_group(self):
-        return "SubmitKernel"
-
     def description(self) -> str:
         order = "in-order" if self.ioq else "out-of-order"
-        completion = "including" if self.measureCompletion else "excluding"
-        return (
-            f"Measures CPU time overhead of submitting {order} kernels through Unified Runtime API, "
-            f"{completion} kernel completion time. Uses 10 simple kernels with minimal execution time "
-            f"to isolate API overhead."
-        )
+        runtime_name = runtime_to_name(self.runtime)
 
-    def bin_args(self) -> list[str]:
-        return [
-            f"--Ioq={self.ioq}",
-            "--DiscardEvents=0",
-            f"--MeasureCompletion={self.measureCompletion}",
-            "--iterations=100000",
-            "--Profiling=0",
-            "--NumKernels=10",
-            "--KernelExecTime=1",
-        ]
-
-
-class SubmitKernelL0(ComputeBenchmark):
-    def __init__(self, bench, ioq):
-        self.ioq = ioq
-        super().__init__(bench, "api_overhead_benchmark_l0", "SubmitKernel")
-
-    def name(self):
-        order = "in order" if self.ioq else "out of order"
-        return f"api_overhead_benchmark_l0 SubmitKernel {order}"
+        completion_desc = ""
+        if self.runtime == RUNTIMES.UR:
+            completion_desc = f", {'including' if self.measure_completion else 'excluding'} kernel completion time"
 
-    def explicit_group(self):
-        return "SubmitKernel"
+        l0_specific = ""
+        if self.runtime == RUNTIMES.LEVEL_ZERO:
+            l0_specific = " Uses immediate command lists"
 
-    def description(self) -> str:
-        order = "in-order" if self.ioq else "out-of-order"
         return (
-            f"Measures CPU time overhead of submitting {order} kernels through Level Zero API. "
-            f"Uses immediate command lists with 10 minimal kernels to isolate submission overhead "
-            f"from execution time."
+            f"Measures CPU time overhead of submitting {order} kernels through {runtime_name} API{completion_desc}. "
+            f"Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. {l0_specific}"
         )
 
     def bin_args(self) -> list[str]:
         return [
             f"--Ioq={self.ioq}",
             "--DiscardEvents=0",
-            "--MeasureCompletion=0",
+            f"--MeasureCompletion={self.measure_completion}",
             "--iterations=100000",
             "--Profiling=0",
             "--NumKernels=10",
@@ -521,12 +500,6 @@ def bin_args(self) -> list[str]:
         ]
 
 
-class RUNTIMES(Enum):
-    SYCL = "sycl"
-    LEVEL_ZERO = "l0"
-    UR = "ur"
-
-
 class GraphApiSinKernelGraph(ComputeBenchmark):
     def __init__(self, bench, runtime: RUNTIMES, withGraphs, numKernels):
         self.withGraphs = withGraphs

devops/scripts/benchmarks/benches/syclbench.py

Lines changed: 2 additions & 2 deletions

@@ -31,8 +31,8 @@ def setup(self):
         repo_path = git_clone(
             self.directory,
             "sycl-bench-repo",
-            "https://github.com/mateuszpn/sycl-bench.git",
-            "1e6ab2cfd004a72c5336c26945965017e06eab71",
+            "https://github.com/unisa-hpc/sycl-bench.git",
+            "31fc70be6266193c4ba60eb1fe3ce26edee4ca5b",
         )
 
         configure_command = [
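The suite now clones the upstream unisa-hpc/sycl-bench repository instead of a personal fork, still pinned to a fixed commit so runs stay reproducible. For illustration only, a minimal sketch of cloning and pinning a repository to a specific revision with plain git commands; the repo's actual `git_clone` helper lives in its own utilities and may differ:

```python
import subprocess
from pathlib import Path

def clone_at_commit(workdir: str, name: str, url: str, commit: str) -> Path:
    """Clone `url` into workdir/name (if not already present) and check out `commit`."""
    repo = Path(workdir) / name
    if not repo.exists():
        subprocess.run(["git", "clone", url, str(repo)], check=True)
    subprocess.run(["git", "-C", str(repo), "checkout", commit], check=True)
    return repo

# Example mirroring the arguments in the diff above:
# clone_at_commit(".", "sycl-bench-repo",
#                 "https://github.com/unisa-hpc/sycl-bench.git",
#                 "31fc70be6266193c4ba60eb1fe3ce26edee4ca5b")
```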

devops/scripts/benchmarks/main.py

Lines changed: 1 addition & 1 deletion

@@ -333,7 +333,7 @@ def validate_and_parse_env_args(env_args):
     parser.add_argument(
         "--adapter",
         type=str,
-        help="Options to build the Unified Runtime as part of the benchmark",
+        help="Unified Runtime adapter to use.",
         default="level_zero",
     )
     parser.add_argument(