13 | 13 | from enum import Enum |
14 | 14 |
15 | 15 |
| 16 | +class RUNTIMES(Enum): |
| 17 | + SYCL = "sycl" |
| 18 | + LEVEL_ZERO = "l0" |
| 19 | + UR = "ur" |
| 20 | + |
| 21 | + |
| 22 | +def runtime_to_name(runtime: RUNTIMES) -> str: |
| 23 | + return { |
| 24 | + RUNTIMES.SYCL: "SYCL", |
| 25 | + RUNTIMES.LEVEL_ZERO: "Level Zero", |
| 26 | + RUNTIMES.UR: "Unified Runtime", |
| 27 | + }[runtime] |
| 28 | + |
| 29 | + |
16 | 30 | class ComputeBench(Suite): |
17 | 31 | def __init__(self, directory): |
18 | 32 | self.directory = directory |
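Note: the RUNTIMES enum and its runtime_to_name helper move to the top of the file (the old definition near GraphApiSinKernelGraph is removed in a later hunk) so every benchmark class can share them. A minimal usage sketch:

    runtime_to_name(RUNTIMES.LEVEL_ZERO)  # -> "Level Zero"
    RUNTIMES.UR.value                     # -> "ur", used in binary names like api_overhead_benchmark_ur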
@@ -70,64 +84,78 @@ def additionalMetadata(self) -> dict[str, BenchmarkMetadata]: |
70 | 84 | ), |
71 | 85 | } |
72 | 86 |
| 87 | + def enabled_runtimes(self, supported_runtimes=None): |
| 88 | + # Default to all runtimes in the RUNTIMES enum |
| 89 | + runtimes = supported_runtimes or list(RUNTIMES) |
| 90 | + |
| 91 | + # Filter out UR if not available |
| 92 | + if options.ur is None: |
| 93 | + runtimes = [r for r in runtimes if r != RUNTIMES.UR] |
| 94 | + |
| 95 | + return runtimes |
| 96 | + |
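A quick sketch of the filtering behavior (bench here is a hypothetical ComputeBench instance, and options.ur is assumed unset):

    bench.enabled_runtimes()                              # [RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]
    bench.enabled_runtimes([RUNTIMES.SYCL, RUNTIMES.UR])  # [RUNTIMES.SYCL]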
73 | 97 | def benchmarks(self) -> list[Benchmark]: |
74 | 98 | if options.sycl is None: |
75 | 99 | return [] |
76 | 100 |
77 | 101 | if options.ur_adapter == "cuda": |
78 | 102 | return [] |
79 | 103 |
80 | | - benches = [ |
81 | | - SubmitKernelL0(self, 0), |
82 | | - SubmitKernelL0(self, 1), |
83 | | - SubmitKernelSYCL(self, 0), |
84 | | - SubmitKernelSYCL(self, 1), |
| 104 | + benches = [] |
| 105 | + |
| 106 | + # Add SubmitKernel benchmarks for every enabled runtime, queue order, and completion mode |
| 107 | + for runtime in self.enabled_runtimes(): |
| 108 | + for in_order_queue in [0, 1]: |
| 109 | + for measure_completion in [0, 1]: |
| 110 | + benches.append( |
| 111 | + SubmitKernel(self, runtime, in_order_queue, measure_completion) |
| 112 | + ) |
| 113 | + |
| 114 | + # Add SinKernelGraph benchmarks |
| 115 | + for runtime in self.enabled_runtimes(): |
| 116 | + for with_graphs in [0, 1]: |
| 117 | + for num_kernels in [5, 100]: |
| 118 | + benches.append( |
| 119 | + GraphApiSinKernelGraph(self, runtime, with_graphs, num_kernels) |
| 120 | + ) |
| 121 | + |
| 122 | + # Add ULLS benchmarks |
| 123 | + for runtime in self.enabled_runtimes([RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]): |
| 124 | + benches.append(UllsEmptyKernel(self, runtime, 1000, 256)) |
| 125 | + benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1)) |
| 126 | + |
| 127 | + # Add GraphApiSubmitGraph benchmarks |
| 128 | + for runtime in self.enabled_runtimes([RUNTIMES.SYCL]): |
| 129 | + for in_order_queue in [0, 1]: |
| 130 | + for num_kernels in [4, 10, 32]: |
| 131 | + for measure_completion_time in [0, 1]: |
| 132 | + benches.append( |
| 133 | + GraphApiSubmitGraph( |
| 134 | + self, |
| 135 | + runtime, |
| 136 | + in_order_queue, |
| 137 | + num_kernels, |
| 138 | + measure_completion_time, |
| 139 | + ) |
| 140 | + ) |
| 141 | + |
| 142 | + # Add other benchmarks |
| 143 | + benches += [ |
85 | 144 | QueueInOrderMemcpy(self, 0, "Device", "Device", 1024), |
86 | 145 | QueueInOrderMemcpy(self, 0, "Host", "Device", 1024), |
87 | 146 | QueueMemcpy(self, "Device", "Device", 1024), |
88 | 147 | StreamMemory(self, "Triad", 10 * 1024, "Device"), |
89 | 148 | ExecImmediateCopyQueue(self, 0, 1, "Device", "Device", 1024), |
90 | 149 | ExecImmediateCopyQueue(self, 1, 1, "Device", "Host", 1024), |
91 | 150 | VectorSum(self), |
92 | | - GraphApiSinKernelGraph(self, RUNTIMES.SYCL, 0, 5), |
93 | | - GraphApiSinKernelGraph(self, RUNTIMES.SYCL, 1, 5), |
94 | | - GraphApiSinKernelGraph(self, RUNTIMES.SYCL, 0, 100), |
95 | | - GraphApiSinKernelGraph(self, RUNTIMES.SYCL, 1, 100), |
96 | | - GraphApiSinKernelGraph(self, RUNTIMES.LEVEL_ZERO, 0, 5), |
97 | | - GraphApiSinKernelGraph(self, RUNTIMES.LEVEL_ZERO, 1, 5), |
98 | | - GraphApiSinKernelGraph(self, RUNTIMES.LEVEL_ZERO, 0, 100), |
99 | | - GraphApiSinKernelGraph(self, RUNTIMES.LEVEL_ZERO, 1, 100), |
100 | | - UllsEmptyKernel(self, RUNTIMES.SYCL, 1000, 256), |
101 | | - UllsEmptyKernel(self, RUNTIMES.LEVEL_ZERO, 1000, 256), |
102 | | - UllsKernelSwitch(self, RUNTIMES.SYCL, 8, 200, 0, 0, 1, 1), |
103 | | - UllsKernelSwitch(self, RUNTIMES.LEVEL_ZERO, 8, 200, 0, 0, 1, 1), |
104 | 151 | ] |
105 | 152 |
106 | | - for in_order_queue in [0, 1]: |
107 | | - for num_kernels in [4, 32]: |
108 | | - for measure_completion_time in [0, 1]: |
109 | | - benches.append( |
110 | | - GraphApiSubmitGraph( |
111 | | - self, |
112 | | - RUNTIMES.SYCL, |
113 | | - in_order_queue, |
114 | | - num_kernels, |
115 | | - measure_completion_time, |
116 | | - ) |
117 | | - ) |
118 | | - |
| 153 | + # Add UR-specific benchmarks |
119 | 154 | if options.ur is not None: |
120 | 155 | benches += [ |
121 | | - SubmitKernelUR(self, 0, 0), |
122 | | - SubmitKernelUR(self, 1, 0), |
123 | | - SubmitKernelUR(self, 1, 1), |
124 | 156 | MemcpyExecute(self, 400, 1, 102400, 10, 1, 1, 1), |
125 | 157 | MemcpyExecute(self, 400, 1, 102400, 10, 0, 1, 1), |
126 | 158 | MemcpyExecute(self, 4096, 4, 1024, 10, 0, 1, 0), |
127 | | - GraphApiSinKernelGraph(self, RUNTIMES.UR, 0, 5), |
128 | | - GraphApiSinKernelGraph(self, RUNTIMES.UR, 1, 5), |
129 | | - GraphApiSinKernelGraph(self, RUNTIMES.UR, 0, 100), |
130 | | - GraphApiSinKernelGraph(self, RUNTIMES.UR, 1, 100), |
131 | 159 | ] |
132 | 160 |
133 | 161 | return benches |
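Note: the loop-based construction expands the full parameter matrix instead of hand-enumerating instances. SubmitKernel alone now yields len(enabled_runtimes()) x 2 queue orders x 2 completion modes (12 variants with all three runtimes enabled, versus the 7 hand-written SubmitKernelSYCL/L0/UR instances removed above), and the GraphApiSubmitGraph sweep gains num_kernels=10. An equivalent, purely illustrative formulation with itertools.product:

    from itertools import product

    # One append per (runtime, in_order_queue, measure_completion) combination
    for runtime, ioq, completion in product(self.enabled_runtimes(), [0, 1], [0, 1]):
        benches.append(SubmitKernel(self, runtime, ioq, completion))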
@@ -228,98 +256,49 @@ def teardown(self): |
228 | 256 | return |
229 | 257 |
230 | 258 |
231 | | -class SubmitKernelSYCL(ComputeBenchmark): |
232 | | - def __init__(self, bench, ioq): |
| 259 | +class SubmitKernel(ComputeBenchmark): |
| 260 | + def __init__(self, bench, runtime: RUNTIMES, ioq, measure_completion=0): |
233 | 261 | self.ioq = ioq |
234 | | - super().__init__(bench, "api_overhead_benchmark_sycl", "SubmitKernel") |
| 262 | + self.runtime = runtime |
| 263 | + self.measure_completion = measure_completion |
| 264 | + super().__init__( |
| 265 | + bench, f"api_overhead_benchmark_{runtime.value}", "SubmitKernel" |
| 266 | + ) |
235 | 267 |
236 | 268 | def name(self): |
237 | 269 | order = "in order" if self.ioq else "out of order" |
238 | | - return f"api_overhead_benchmark_sycl SubmitKernel {order}" |
| 270 | + completion_str = " with measure completion" if self.measure_completion else "" |
| 271 | + return f"api_overhead_benchmark_{self.runtime.value} SubmitKernel {order}{completion_str}" |
239 | 272 |
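The unified name() encodes runtime, queue order, and completion mode, so the generated names look like:

    api_overhead_benchmark_sycl SubmitKernel in order
    api_overhead_benchmark_l0 SubmitKernel out of order
    api_overhead_benchmark_ur SubmitKernel in order with measure completion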
240 | 273 | def explicit_group(self): |
241 | | - return "SubmitKernel" |
242 | | - |
243 | | - def bin_args(self) -> list[str]: |
244 | | - return [ |
245 | | - f"--Ioq={self.ioq}", |
246 | | - "--DiscardEvents=0", |
247 | | - "--MeasureCompletion=0", |
248 | | - "--iterations=100000", |
249 | | - "--Profiling=0", |
250 | | - "--NumKernels=10", |
251 | | - "--KernelExecTime=1", |
252 | | - ] |
253 | | - |
254 | | - def description(self) -> str: |
255 | | - order = "in-order" if self.ioq else "out-of-order" |
256 | 274 | return ( |
257 | | - f"Measures CPU time overhead of submitting {order} kernels through SYCL API." |
258 | | - "Uses 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time." |
259 | | - ) |
260 | | - |
261 | | - |
262 | | -class SubmitKernelUR(ComputeBenchmark): |
263 | | - def __init__(self, bench, ioq, measureCompletion): |
264 | | - self.ioq = ioq |
265 | | - self.measureCompletion = measureCompletion |
266 | | - super().__init__(bench, "api_overhead_benchmark_ur", "SubmitKernel") |
267 | | - |
268 | | - def name(self): |
269 | | - order = "in order" if self.ioq else "out of order" |
270 | | - return f"api_overhead_benchmark_ur SubmitKernel {order}" + ( |
271 | | - " with measure completion" if self.measureCompletion else "" |
| 275 | + "SubmitKernel" |
| 276 | + if self.measure_completion == 0 |
| 277 | + else "SubmitKernel With Completion" |
272 | 278 | ) |
273 | 279 |
274 | | - def explicit_group(self): |
275 | | - return "SubmitKernel" |
276 | | - |
277 | 280 | def description(self) -> str: |
278 | 281 | order = "in-order" if self.ioq else "out-of-order" |
279 | | - completion = "including" if self.measureCompletion else "excluding" |
280 | | - return ( |
281 | | - f"Measures CPU time overhead of submitting {order} kernels through Unified Runtime API, " |
282 | | - f"{completion} kernel completion time. Uses 10 simple kernels with minimal execution time " |
283 | | - f"to isolate API overhead." |
284 | | - ) |
| 282 | + runtime_name = runtime_to_name(self.runtime) |
285 | 283 |
286 | | - def bin_args(self) -> list[str]: |
287 | | - return [ |
288 | | - f"--Ioq={self.ioq}", |
289 | | - "--DiscardEvents=0", |
290 | | - f"--MeasureCompletion={self.measureCompletion}", |
291 | | - "--iterations=100000", |
292 | | - "--Profiling=0", |
293 | | - "--NumKernels=10", |
294 | | - "--KernelExecTime=1", |
295 | | - ] |
296 | | - |
297 | | - |
298 | | -class SubmitKernelL0(ComputeBenchmark): |
299 | | - def __init__(self, bench, ioq): |
300 | | - self.ioq = ioq |
301 | | - super().__init__(bench, "api_overhead_benchmark_l0", "SubmitKernel") |
302 | | - |
303 | | - def name(self): |
304 | | - order = "in order" if self.ioq else "out of order" |
305 | | - return f"api_overhead_benchmark_l0 SubmitKernel {order}" |
| 284 | + completion_desc = "" |
| 285 | + if self.runtime == RUNTIMES.UR: |
| 286 | + completion_desc = f", {'including' if self.measure_completion else 'excluding'} kernel completion time" |
306 | 287 |
307 | | - def explicit_group(self): |
308 | | - return "SubmitKernel" |
| 288 | + l0_specific = "" |
| 289 | + if self.runtime == RUNTIMES.LEVEL_ZERO: |
| 290 | + l0_specific = " Uses immediate command lists." |
309 | 291 |
310 | | - def description(self) -> str: |
311 | | - order = "in-order" if self.ioq else "out-of-order" |
312 | 292 | return ( |
313 | | - f"Measures CPU time overhead of submitting {order} kernels through Level Zero API. " |
314 | | - f"Uses immediate command lists with 10 minimal kernels to isolate submission overhead " |
315 | | - f"from execution time." |
| 293 | + f"Measures CPU time overhead of submitting {order} kernels through {runtime_name} API{completion_desc}. " |
| 294 | + f"Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time.{l0_specific}" |
316 | 295 | ) |
317 | 296 |
318 | 297 | def bin_args(self) -> list[str]: |
319 | 298 | return [ |
320 | 299 | f"--Ioq={self.ioq}", |
321 | 300 | "--DiscardEvents=0", |
322 | | - "--MeasureCompletion=0", |
| 301 | + f"--MeasureCompletion={self.measure_completion}", |
323 | 302 | "--iterations=100000", |
324 | 303 | "--Profiling=0", |
325 | 304 | "--NumKernels=10", |
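Only --Ioq and --MeasureCompletion vary per instance in the merged bin_args; for example, SubmitKernel(bench, RUNTIMES.UR, 1, 1) would invoke the binary roughly as follows (flags after --NumKernels are cut off by the hunk and assumed unchanged):

    api_overhead_benchmark_ur --Ioq=1 --DiscardEvents=0 --MeasureCompletion=1 \
        --iterations=100000 --Profiling=0 --NumKernels=10 ...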
@@ -521,12 +500,6 @@ def bin_args(self) -> list[str]: |
521 | 500 | ] |
522 | 501 |
523 | 502 |
524 | | -class RUNTIMES(Enum): |
525 | | - SYCL = "sycl" |
526 | | - LEVEL_ZERO = "l0" |
527 | | - UR = "ur" |
528 | | - |
529 | | - |
530 | 503 | class GraphApiSinKernelGraph(ComputeBenchmark): |
531 | 504 | def __init__(self, bench, runtime: RUNTIMES, withGraphs, numKernels): |
532 | 505 | self.withGraphs = withGraphs |