@@ -53,7 +53,7 @@ def git_url(self) -> str:
         return "https://github.com/intel/compute-benchmarks.git"
 
     def git_hash(self) -> str:
-        return "c9e135d4f26dd6badd83009f92f25d6285fc1e21"
+        return "4995560017559849a519e58978a0afdd55903e15"
 
     def setup(self) -> None:
         if options.sycl is None:
@@ -173,6 +173,9 @@ def benchmarks(self) -> list[Benchmark]:
         # See SubmitKernel.enabled()
         long_kernel_exec_time_ooo = [20, 200]
 
+        # The Combo Profiler is available only for selected sycl benchmarks
+        profiler_types = ["timer", "cpuCounter"]
+
         for runtime in list(RUNTIMES):
             # Add SubmitKernel benchmarks using loops
             for in_order_queue in [0, 1]:
@@ -184,16 +187,18 @@ def benchmarks(self) -> list[Benchmark]:
                             else long_kernel_exec_time_ooo
                         )
                         for kernel_exec_time in [1, *long_kernel_exec_time]:
-                            benches.append(
-                                SubmitKernel(
-                                    self,
-                                    runtime,
-                                    in_order_queue,
-                                    measure_completion,
-                                    use_events,
-                                    kernel_exec_time,
+                            for profiler_type in profiler_types:
+                                benches.append(
+                                    SubmitKernel(
+                                        self,
+                                        runtime,
+                                        in_order_queue,
+                                        measure_completion,
+                                        use_events,
+                                        kernel_exec_time,
+                                        profiler_type,
+                                    )
                                 )
-                            )
 
             # Add SinKernelGraph benchmarks
             for with_graphs in [0, 1]:
@@ -203,51 +208,69 @@ def benchmarks(self) -> list[Benchmark]:
                     )
 
             # Add ULLS benchmarks
-            benches.append(UllsEmptyKernel(self, runtime, 1000, 256))
+            for profiler_type in profiler_types:
+                benches.append(UllsEmptyKernel(self, runtime, 1000, 256, profiler_type))
             benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1))
 
             # Add GraphApiSubmitGraph benchmarks
             for in_order_queue in [0, 1]:
-                benches.append(
-                    GraphApiSubmitGraph(
-                        self,
-                        runtime,
-                        in_order_queue,
-                        self.submit_graph_num_kernels[-1],
-                        0,
-                        useEvents=0,
-                        useHostTasks=1,
+                for profiler_type in profiler_types:
+                    benches.append(
+                        GraphApiSubmitGraph(
+                            self,
+                            runtime,
+                            in_order_queue,
+                            self.submit_graph_num_kernels[-1],
+                            0,
+                            profiler_type,
+                            useEvents=0,
+                            useHostTasks=1,
+                        )
                     )
-                )
                 for num_kernels in self.submit_graph_num_kernels:
                     for measure_completion_time in [0, 1]:
                         for use_events in [0, 1]:
-                            benches.append(
-                                GraphApiSubmitGraph(
-                                    self,
-                                    runtime,
-                                    in_order_queue,
-                                    num_kernels,
-                                    measure_completion_time,
-                                    use_events,
-                                    useHostTasks=0,
+                            for profiler_type in profiler_types:
+                                benches.append(
+                                    GraphApiSubmitGraph(
+                                        self,
+                                        runtime,
+                                        in_order_queue,
+                                        num_kernels,
+                                        measure_completion_time,
+                                        profiler_type,
+                                        use_events,
+                                        useHostTasks=0,
+                                    )
                                 )
-                            )
 
             # Add other benchmarks
             benches += [
-                QueueInOrderMemcpy(self, 0, "Device", "Device", 1024),
-                QueueInOrderMemcpy(self, 0, "Host", "Device", 1024),
-                QueueMemcpy(self, "Device", "Device", 1024),
                 StreamMemory(self, "Triad", 10 * 1024, "Device"),
-                ExecImmediateCopyQueue(self, 0, 1, "Device", "Device", 1024),
-                ExecImmediateCopyQueue(self, 1, 1, "Device", "Host", 1024),
                 VectorSum(self),
                 GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 0, "Gromacs"),
                 GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 1, "Gromacs"),
                 GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 0, "Llama"),
                 GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 1, "Llama"),
             ]
+            for profiler_type in profiler_types:
+                benches.append(
+                    QueueInOrderMemcpy(self, 0, "Device", "Device", 1024, profiler_type)
+                )
+                benches.append(
+                    QueueInOrderMemcpy(self, 0, "Host", "Device", 1024, profiler_type)
+                )
+                benches.append(QueueMemcpy(self, "Device", "Device", 1024, profiler_type))
+                benches.append(
+                    ExecImmediateCopyQueue(
+                        self, 0, 1, "Device", "Device", 1024, profiler_type
+                    )
+                )
+                benches.append(
+                    ExecImmediateCopyQueue(
+                        self, 1, 1, "Device", "Host", 1024, profiler_type
+                    )
+                )
 
         # Add UR-specific benchmarks
         benches += [
@@ -295,12 +318,15 @@ def parse_unit_type(compute_unit):
 
 
 class ComputeBenchmark(Benchmark):
-    def __init__(self, bench, name, test, runtime: RUNTIMES = None):
+    def __init__(
+        self, bench, name, test, runtime: RUNTIMES = None, profiler_type: str = ""
+    ):
         super().__init__(bench.directory, bench)
         self.bench = bench
         self.bench_name = name
         self.test = test
         self.runtime = runtime
+        self.profiler_type = profiler_type
 
     def supported_runtimes(self) -> list[RUNTIMES]:
         """Base runtimes supported by this benchmark, can be overridden."""
@@ -428,14 +454,19 @@ def __init__(
         MeasureCompletion=0,
         UseEvents=0,
         KernelExecTime=1,
+        profiler_type="",
     ):
         self.ioq = ioq
         self.MeasureCompletion = MeasureCompletion
         self.UseEvents = UseEvents
         self.KernelExecTime = KernelExecTime
         self.NumKernels = 10
         super().__init__(
-            bench, f"api_overhead_benchmark_{runtime.value}", "SubmitKernel", runtime
+            bench,
+            f"api_overhead_benchmark_{runtime.value}",
+            "SubmitKernel",
+            runtime,
+            profiler_type,
         )
 
     def supported_runtimes(self) -> list[RUNTIMES]:
@@ -504,7 +535,7 @@ def range(self) -> tuple[float, float]:
         return (0.0, None)
 
     def bin_args(self) -> list[str]:
-        return [
+        bin_args = [
             f"--Ioq={self.ioq}",
             f"--MeasureCompletion={self.MeasureCompletion}",
             "--iterations=100000",
@@ -513,6 +544,9 @@ def bin_args(self) -> list[str]:
             f"--KernelExecTime={self.KernelExecTime}",
             f"--UseEvents={self.UseEvents}",
         ]
+        if self.runtime == RUNTIMES.SYCL:
+            bin_args.append(f"--profilerType={self.profiler_type}")
+        return bin_args
 
     def get_metadata(self) -> dict[str, BenchmarkMetadata]:
         metadata_dict = super().get_metadata()
@@ -532,13 +566,20 @@ def get_metadata(self) -> dict[str, BenchmarkMetadata]:
 
 
 class ExecImmediateCopyQueue(ComputeBenchmark):
-    def __init__(self, bench, ioq, isCopyOnly, source, destination, size):
+    def __init__(
+        self, bench, ioq, isCopyOnly, source, destination, size, profiler_type
+    ):
         self.ioq = ioq
         self.isCopyOnly = isCopyOnly
         self.source = source
         self.destination = destination
         self.size = size
-        super().__init__(bench, "api_overhead_benchmark_sycl", "ExecImmediateCopyQueue")
+        super().__init__(
+            bench,
+            "api_overhead_benchmark_sycl",
+            "ExecImmediateCopyQueue",
+            profiler_type=profiler_type,
+        )
 
     def name(self):
         order = "in order" if self.ioq else "out of order"
@@ -569,16 +610,22 @@ def bin_args(self) -> list[str]:
             f"--dst={self.destination}",
             f"--size={self.size}",
             "--withCopyOffload=0",
+            f"--profilerType={self.profiler_type}",
         ]
 
 
 class QueueInOrderMemcpy(ComputeBenchmark):
-    def __init__(self, bench, isCopyOnly, source, destination, size):
+    def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type):
         self.isCopyOnly = isCopyOnly
         self.source = source
         self.destination = destination
         self.size = size
-        super().__init__(bench, "memory_benchmark_sycl", "QueueInOrderMemcpy")
+        super().__init__(
+            bench,
+            "memory_benchmark_sycl",
+            "QueueInOrderMemcpy",
+            profiler_type=profiler_type,
+        )
 
     def name(self):
         return f"memory_benchmark_sycl QueueInOrderMemcpy from {self.source} to {self.destination}, size {self.size}"
@@ -605,15 +652,18 @@ def bin_args(self) -> list[str]:
             f"--size={self.size}",
             "--count=100",
             "--withCopyOffload=0",
+            f"--profilerType={self.profiler_type}",
         ]
 
 
 class QueueMemcpy(ComputeBenchmark):
-    def __init__(self, bench, source, destination, size):
+    def __init__(self, bench, source, destination, size, profiler_type):
         self.source = source
         self.destination = destination
         self.size = size
-        super().__init__(bench, "memory_benchmark_sycl", "QueueMemcpy")
+        super().__init__(
+            bench, "memory_benchmark_sycl", "QueueMemcpy", profiler_type=profiler_type
+        )
 
     def name(self):
         return f"memory_benchmark_sycl QueueMemcpy from {self.source} to {self.destination}, size {self.size}"
@@ -636,6 +686,7 @@ def bin_args(self) -> list[str]:
             f"--sourcePlacement={self.source}",
             f"--destinationPlacement={self.destination}",
             f"--size={self.size}",
+            f"--profilerType={self.profiler_type}",
         ]
 
 
@@ -858,6 +909,7 @@ def __init__(
         inOrderQueue,
         numKernels,
         measureCompletionTime,
+        profiler_type,
         useEvents,
         useHostTasks,
     ):
@@ -873,7 +925,11 @@ def __init__(
         self.use_events_str = f" with events" if self.useEvents else ""
         self.host_tasks_str = f" use host tasks" if self.useHostTasks else ""
         super().__init__(
-            bench, f"graph_api_benchmark_{runtime.value}", "SubmitGraph", runtime
+            bench,
+            f"graph_api_benchmark_{runtime.value}",
+            "SubmitGraph",
+            runtime,
+            profiler_type,
         )
 
     def explicit_group(self):
@@ -901,7 +957,7 @@ def get_tags(self):
         ]
 
     def bin_args(self) -> list[str]:
-        return [
+        bin_args = [
             "--iterations=10000",
             f"--NumKernels={self.numKernels}",
             f"--MeasureCompletionTime={self.measureCompletionTime}",
@@ -912,14 +968,21 @@ def bin_args(self) -> list[str]:
             "--UseExplicit=0",
             f"--UseHostTasks={self.useHostTasks}",
         ]
+        if self.runtime == RUNTIMES.SYCL:
+            bin_args.append(f"--profilerType={self.profiler_type}")
+        return bin_args
 
 
 class UllsEmptyKernel(ComputeBenchmark):
-    def __init__(self, bench, runtime: RUNTIMES, wgc, wgs):
+    def __init__(self, bench, runtime: RUNTIMES, wgc, wgs, profiler_type):
         self.wgc = wgc
         self.wgs = wgs
         super().__init__(
-            bench, f"ulls_benchmark_{runtime.value}", "EmptyKernel", runtime
+            bench,
+            f"ulls_benchmark_{runtime.value}",
+            "EmptyKernel",
+            runtime,
+            profiler_type,
         )
 
     def supported_runtimes(self) -> list[RUNTIMES]:
@@ -943,11 +1006,14 @@ def get_tags(self):
         return [runtime_to_tag_name(self.runtime), "micro", "latency", "submit"]
 
     def bin_args(self) -> list[str]:
-        return [
+        bin_args = [
             "--iterations=10000",
             f"--wgs={self.wgs}",
             f"--wgc={self.wgc}",
         ]
+        if self.runtime == RUNTIMES.SYCL:
+            bin_args.append(f"--profilerType={self.profiler_type}")
+        return bin_args
 
 
 class UllsKernelSwitch(ComputeBenchmark):
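The change applies one pattern throughout: each benchmark constructor gains a profiler_type argument that is forwarded to ComputeBenchmark.__init__, and bin_args() emits --profilerType only for sycl binaries, since the Combo Profiler is available only for selected sycl benchmarks. Below is a minimal, self-contained sketch of that plumbing, not code from the patch; the class name ComputeBenchmarkSketch and the LEVEL_ZERO enum value are illustrative assumptions.

# Minimal sketch (assumed names, not the patch's real classes): profiler_type is
# stored by the base constructor and --profilerType is appended only when the
# runtime is SYCL, mirroring the conditional added to the SYCL-capable benchmarks.
from enum import Enum


class RUNTIMES(Enum):
    SYCL = "sycl"  # consistent with f"api_overhead_benchmark_{runtime.value}" in the diff
    LEVEL_ZERO = "l0"  # assumed second member, shown only for contrast


class ComputeBenchmarkSketch:
    def __init__(self, runtime: RUNTIMES = None, profiler_type: str = ""):
        self.runtime = runtime
        self.profiler_type = profiler_type

    def bin_args(self) -> list[str]:
        bin_args = ["--iterations=10000"]
        # Only the sycl binaries understand the Combo Profiler flag.
        if self.runtime == RUNTIMES.SYCL:
            bin_args.append(f"--profilerType={self.profiler_type}")
        return bin_args


if __name__ == "__main__":
    for profiler_type in ["timer", "cpuCounter"]:
        print(ComputeBenchmarkSketch(RUNTIMES.SYCL, profiler_type).bin_args())
    print(ComputeBenchmarkSketch(RUNTIMES.LEVEL_ZERO).bin_args())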