@@ -54,7 +54,7 @@ def git_url(self) -> str:
5454 return "https://github.com/intel/compute-benchmarks.git"
5555
5656 def git_hash (self ) -> str :
57- return "c9e135d4f26dd6badd83009f92f25d6285fc1e21 "
57+ return "4995560017559849a519e58978a0afdd55903e15 "
5858
5959 def setup (self ) -> None :
6060 if options .sycl is None :
@@ -177,6 +177,9 @@ def benchmarks(self) -> list[Benchmark]:
177177 # See SubmitKernel.enabled()
178178 long_kernel_exec_time_ooo = [20 , 200 ]
179179
180+ # The Combo Profiler is available only for selected sycl benchmarks
181+ profiler_types = ["timer" , "cpuCounter" ]
182+
180183 for runtime in list (RUNTIMES ):
181184 # Add SubmitKernel benchmarks using loops
182185 for in_order_queue in [0 , 1 ]:
@@ -188,16 +191,18 @@ def benchmarks(self) -> list[Benchmark]:
188191 else long_kernel_exec_time_ooo
189192 )
190193 for kernel_exec_time in [1 , * long_kernel_exec_time ]:
191- benches .append (
192- SubmitKernel (
193- self ,
194- runtime ,
195- in_order_queue ,
196- measure_completion ,
197- use_events ,
198- kernel_exec_time ,
194+ for profiler_type in profiler_types :
195+ benches .append (
196+ SubmitKernel (
197+ self ,
198+ runtime ,
199+ in_order_queue ,
200+ measure_completion ,
201+ use_events ,
202+ kernel_exec_time ,
203+ profiler_type ,
204+ )
199205 )
200- )
201206
202207 # Add SinKernelGraph benchmarks
203208 for with_graphs in [0 , 1 ]:
@@ -207,51 +212,69 @@ def benchmarks(self) -> list[Benchmark]:
207212 )
208213
209214 # Add ULLS benchmarks
210- benches .append (UllsEmptyKernel (self , runtime , 1000 , 256 ))
215+ for profiler_type in profiler_types :
216+ benches .append (UllsEmptyKernel (self , runtime , 1000 , 256 , profiler_type ))
211217 benches .append (UllsKernelSwitch (self , runtime , 8 , 200 , 0 , 0 , 1 , 1 ))
212218
213219 # Add GraphApiSubmitGraph benchmarks
214220 for in_order_queue in [0 , 1 ]:
215- benches .append (
216- GraphApiSubmitGraph (
217- self ,
218- runtime ,
219- in_order_queue ,
220- self .submit_graph_num_kernels [- 1 ],
221- 0 ,
222- useEvents = 0 ,
223- useHostTasks = 1 ,
221+ for profiler_type in profiler_types :
222+ benches .append (
223+ GraphApiSubmitGraph (
224+ self ,
225+ runtime ,
226+ in_order_queue ,
227+ self .submit_graph_num_kernels [- 1 ],
228+ 0 ,
229+ profiler_type ,
230+ useEvents = 0 ,
231+ useHostTasks = 1 ,
232+ )
224233 )
225- )
226234 for num_kernels in self .submit_graph_num_kernels :
227235 for measure_completion_time in [0 , 1 ]:
228236 for use_events in [0 , 1 ]:
229- benches .append (
230- GraphApiSubmitGraph (
231- self ,
232- runtime ,
233- in_order_queue ,
234- num_kernels ,
235- measure_completion_time ,
236- use_events ,
237- useHostTasks = 0 ,
237+ for profiler_type in profiler_types :
238+ benches .append (
239+ GraphApiSubmitGraph (
240+ self ,
241+ runtime ,
242+ in_order_queue ,
243+ num_kernels ,
244+ measure_completion_time ,
245+ profiler_type ,
246+ use_events ,
247+ useHostTasks = 0 ,
248+ )
238249 )
239- )
240250
241251 # Add other benchmarks
242252 benches += [
243- QueueInOrderMemcpy (self , 0 , "Device" , "Device" , 1024 ),
244- QueueInOrderMemcpy (self , 0 , "Host" , "Device" , 1024 ),
245- QueueMemcpy (self , "Device" , "Device" , 1024 ),
246253 StreamMemory (self , "Triad" , 10 * 1024 , "Device" ),
247- ExecImmediateCopyQueue (self , 0 , 1 , "Device" , "Device" , 1024 ),
248- ExecImmediateCopyQueue (self , 1 , 1 , "Device" , "Host" , 1024 ),
249254 VectorSum (self ),
250255 GraphApiFinalizeGraph (self , RUNTIMES .SYCL , 0 , "Gromacs" ),
251256 GraphApiFinalizeGraph (self , RUNTIMES .SYCL , 1 , "Gromacs" ),
252257 GraphApiFinalizeGraph (self , RUNTIMES .SYCL , 0 , "Llama" ),
253258 GraphApiFinalizeGraph (self , RUNTIMES .SYCL , 1 , "Llama" ),
254259 ]
260+ for profiler_type in profiler_types :
261+ benches .append (
262+ QueueInOrderMemcpy (self , 0 , "Device" , "Device" , 1024 , profiler_type )
263+ )
264+ benches .append (
265+ QueueInOrderMemcpy (self , 0 , "Host" , "Device" , 1024 , profiler_type )
266+ )
267+ benches .append (QueueMemcpy (self , "Device" , "Device" , 1024 , profiler_type ))
268+ benches .append (
269+ ExecImmediateCopyQueue (
270+ self , 0 , 1 , "Device" , "Device" , 1024 , profiler_type
271+ )
272+ )
273+ benches .append (
274+ ExecImmediateCopyQueue (
275+ self , 1 , 1 , "Device" , "Host" , 1024 , profiler_type
276+ )
277+ )
255278
256279 # Add UR-specific benchmarks
257280 benches += [
@@ -299,12 +322,15 @@ def parse_unit_type(compute_unit):
299322
300323
301324class ComputeBenchmark (Benchmark ):
302- def __init__ (self , bench , name , test , runtime : RUNTIMES = None ):
325+ def __init__ (
326+ self , bench , name , test , runtime : RUNTIMES = None , profiler_type : str = ""
327+ ):
303328 super ().__init__ (bench .directory , bench )
304329 self .bench = bench
305330 self .bench_name = name
306331 self .test = test
307332 self .runtime = runtime
333+ self .profiler_type = profiler_type
308334 # Mandatory per-benchmark iteration counts.
309335 # Subclasses MUST set both `self.iterations_regular` and
310336 # `self.iterations_trace` (positive ints) in their __init__ before
@@ -465,6 +491,7 @@ def __init__(
465491 MeasureCompletion = 0 ,
466492 UseEvents = 0 ,
467493 KernelExecTime = 1 ,
494+ profiler_type = "" ,
468495 ):
469496 self .ioq = ioq
470497 self .MeasureCompletion = MeasureCompletion
@@ -475,7 +502,11 @@ def __init__(
475502 self .iterations_regular = 100000
476503 self .iterations_trace = 10
477504 super ().__init__ (
478- bench , f"api_overhead_benchmark_{ runtime .value } " , "SubmitKernel" , runtime
505+ bench ,
506+ f"api_overhead_benchmark_{ runtime .value } " ,
507+ "SubmitKernel" ,
508+ runtime ,
509+ profiler_type ,
479510 )
480511
481512 def supported_runtimes (self ) -> list [RUNTIMES ]:
@@ -545,7 +576,7 @@ def range(self) -> tuple[float, float]:
545576
546577 def bin_args (self , run_trace : TracingType = TracingType .NONE ) -> list [str ]:
547578 iters = self .get_iters (run_trace )
548- return [
579+ bin_args = [
549580 f"--iterations={ iters } " ,
550581 f"--Ioq={ self .ioq } " ,
551582 f"--MeasureCompletion={ self .MeasureCompletion } " ,
@@ -554,6 +585,9 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
554585 f"--KernelExecTime={ self .KernelExecTime } " ,
555586 f"--UseEvents={ self .UseEvents } " ,
556587 ]
588+ if self .runtime == RUNTIMES .SYCL :
589+ bin_args .append (f"--profilerType={ self .profiler_type } " )
590+ return bin_args
557591
558592 def get_metadata (self ) -> dict [str , BenchmarkMetadata ]:
559593 metadata_dict = super ().get_metadata ()
@@ -573,7 +607,9 @@ def get_metadata(self) -> dict[str, BenchmarkMetadata]:
573607
574608
575609class ExecImmediateCopyQueue (ComputeBenchmark ):
576- def __init__ (self , bench , ioq , isCopyOnly , source , destination , size ):
610+ def __init__ (
611+ self , bench , ioq , isCopyOnly , source , destination , size , profiler_type
612+ ):
577613 self .ioq = ioq
578614 self .isCopyOnly = isCopyOnly
579615 self .source = source
@@ -582,7 +618,12 @@ def __init__(self, bench, ioq, isCopyOnly, source, destination, size):
582618 # iterations per bin_args: --iterations=100000
583619 self .iterations_regular = 100000
584620 self .iterations_trace = 10
585- super ().__init__ (bench , "api_overhead_benchmark_sycl" , "ExecImmediateCopyQueue" )
621+ super ().__init__ (
622+ bench ,
623+ "api_overhead_benchmark_sycl" ,
624+ "ExecImmediateCopyQueue" ,
625+ profiler_type = profiler_type ,
626+ )
586627
587628 def name (self ):
588629 order = "in order" if self .ioq else "out of order"
@@ -614,19 +655,25 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
614655 f"--dst={ self .destination } " ,
615656 f"--size={ self .size } " ,
616657 "--withCopyOffload=0" ,
658+ f"--profilerType={ self .profiler_type } " ,
617659 ]
618660
619661
620662class QueueInOrderMemcpy (ComputeBenchmark ):
621- def __init__ (self , bench , isCopyOnly , source , destination , size ):
663+ def __init__ (self , bench , isCopyOnly , source , destination , size , profiler_type ):
622664 self .isCopyOnly = isCopyOnly
623665 self .source = source
624666 self .destination = destination
625667 self .size = size
626668 # iterations per bin_args: --iterations=10000
627669 self .iterations_regular = 10000
628670 self .iterations_trace = 10
629- super ().__init__ (bench , "memory_benchmark_sycl" , "QueueInOrderMemcpy" )
671+ super ().__init__ (
672+ bench ,
673+ "memory_benchmark_sycl" ,
674+ "QueueInOrderMemcpy" ,
675+ profiler_type = profiler_type ,
676+ )
630677
631678 def name (self ):
632679 return f"memory_benchmark_sycl QueueInOrderMemcpy from { self .source } to { self .destination } , size { self .size } "
@@ -654,18 +701,21 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
654701 f"--size={ self .size } " ,
655702 "--count=100" ,
656703 "--withCopyOffload=0" ,
704+ f"--profilerType={ self .profiler_type } " ,
657705 ]
658706
659707
660708class QueueMemcpy (ComputeBenchmark ):
661- def __init__ (self , bench , source , destination , size ):
709+ def __init__ (self , bench , source , destination , size , profiler_type ):
662710 self .source = source
663711 self .destination = destination
664712 self .size = size
665713 # iterations per bin_args: --iterations=10000
666714 self .iterations_regular = 10000
667715 self .iterations_trace = 10
668- super ().__init__ (bench , "memory_benchmark_sycl" , "QueueMemcpy" )
716+ super ().__init__ (
717+ bench , "memory_benchmark_sycl" , "QueueMemcpy" , profiler_type = profiler_type
718+ )
669719
670720 def name (self ):
671721 return f"memory_benchmark_sycl QueueMemcpy from { self .source } to { self .destination } , size { self .size } "
@@ -689,6 +739,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
689739 f"--sourcePlacement={ self .source } " ,
690740 f"--destinationPlacement={ self .destination } " ,
691741 f"--size={ self .size } " ,
742+ f"--profilerType={ self .profiler_type } " ,
692743 ]
693744
694745
@@ -927,6 +978,7 @@ def __init__(
927978 inOrderQueue ,
928979 numKernels ,
929980 measureCompletionTime ,
981+ profiler_type ,
930982 useEvents ,
931983 useHostTasks ,
932984 ):
@@ -945,7 +997,11 @@ def __init__(
945997 self .iterations_regular = 10000
946998 self .iterations_trace = 10
947999 super ().__init__ (
948- bench , f"graph_api_benchmark_{ runtime .value } " , "SubmitGraph" , runtime
1000+ bench ,
1001+ f"graph_api_benchmark_{ runtime .value } " ,
1002+ "SubmitGraph" ,
1003+ runtime ,
1004+ profiler_type ,
9491005 )
9501006
9511007 def explicit_group (self ):
@@ -974,7 +1030,7 @@ def get_tags(self):
9741030
9751031 def bin_args (self , run_trace : TracingType = TracingType .NONE ) -> list [str ]:
9761032 iters = self .get_iters (run_trace )
977- return [
1033+ bin_args = [
9781034 f"--iterations={ iters } " ,
9791035 f"--NumKernels={ self .numKernels } " ,
9801036 f"--MeasureCompletionTime={ self .measureCompletionTime } " ,
@@ -985,17 +1041,24 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
9851041 "--UseExplicit=0" ,
9861042 f"--UseHostTasks={ self .useHostTasks } " ,
9871043 ]
1044+ if self .runtime == RUNTIMES .SYCL :
1045+ bin_args .append (f"--profilerType={ self .profiler_type } " )
1046+ return bin_args
9881047
9891048
9901049class UllsEmptyKernel (ComputeBenchmark ):
991- def __init__ (self , bench , runtime : RUNTIMES , wgc , wgs ):
1050+ def __init__ (self , bench , runtime : RUNTIMES , wgc , wgs , profiler_type ):
9921051 self .wgc = wgc
9931052 self .wgs = wgs
9941053 # iterations per bin_args: --iterations=10000
9951054 self .iterations_regular = 10000
9961055 self .iterations_trace = 10
9971056 super ().__init__ (
998- bench , f"ulls_benchmark_{ runtime .value } " , "EmptyKernel" , runtime
1057+ bench ,
1058+ f"ulls_benchmark_{ runtime .value } " ,
1059+ "EmptyKernel" ,
1060+ runtime ,
1061+ profiler_type ,
9991062 )
10001063
10011064 def supported_runtimes (self ) -> list [RUNTIMES ]:
@@ -1020,11 +1083,14 @@ def get_tags(self):
10201083
10211084 def bin_args (self , run_trace : TracingType = TracingType .NONE ) -> list [str ]:
10221085 iters = self .get_iters (run_trace )
1023- return [
1086+ bin_args = [
10241087 f"--iterations={ iters } " ,
10251088 f"--wgs={ self .wgs } " ,
10261089 f"--wgc={ self .wgc } " ,
10271090 ]
1091+ if self .runtime == RUNTIMES .SYCL :
1092+ bin_args .append (f"--profilerType={ self .profiler_type } " )
1093+ return bin_args
10281094
10291095
10301096class UllsKernelSwitch (ComputeBenchmark ):
0 commit comments