@@ -53,7 +53,7 @@ def git_url(self) -> str:
5353 return "https://github.com/intel/compute-benchmarks.git"
5454
def git_hash(self) -> str:
    """Return the pinned commit hash used for this benchmark suite.

    The value is a full 40-character commit id; presumably it is checked
    out from the repository reported by ``git_url()`` -- confirm against
    the caller.

    Returns:
        The commit hash as a plain string with no surrounding whitespace.
    """
    # Keep the literal free of padding: a hash with a trailing space is
    # not a valid revision name.
    return "4995560017559849a519e58978a0afdd55903e15"
5757
5858 def setup (self ) -> None :
5959 if options .sycl is None :
@@ -173,6 +173,9 @@ def benchmarks(self) -> list[Benchmark]:
173173 # See SubmitKernel.enabled()
174174 long_kernel_exec_time_ooo = [20 , 200 ]
175175
176+ # The Combo Profiler is available only for selected sycl benchmarks
177+ profiler_types = ["timer" , "cpuCounter" ]
178+
176179 for runtime in list (RUNTIMES ):
177180 # Add SubmitKernel benchmarks using loops
178181 for in_order_queue in [0 , 1 ]:
@@ -184,14 +187,16 @@ def benchmarks(self) -> list[Benchmark]:
184187 else long_kernel_exec_time_ooo
185188 )
186189 for kernel_exec_time in [1 , * long_kernel_exec_time ]:
187- benches .append (
188- SubmitKernel (
189- self ,
190- runtime ,
191- in_order_queue ,
192- measure_completion ,
193- use_events ,
194- kernel_exec_time ,
190+ for profiler_type in profiler_types :
191+ benches .append (
192+ SubmitKernel (
193+ self ,
194+ runtime ,
195+ in_order_queue ,
196+ measure_completion ,
197+ use_events ,
198+ kernel_exec_time ,
199+ profiler_type ,
195200 )
196201 )
197202
@@ -203,51 +208,57 @@ def benchmarks(self) -> list[Benchmark]:
203208 )
204209
205210 # Add ULLS benchmarks
206- benches .append (UllsEmptyKernel (self , runtime , 1000 , 256 ))
211+ for profiler_type in profiler_types :
212+ benches .append (UllsEmptyKernel (self , runtime , 1000 , 256 , profiler_type ))
207213 benches .append (UllsKernelSwitch (self , runtime , 8 , 200 , 0 , 0 , 1 , 1 ))
208214
209215 # Add GraphApiSubmitGraph benchmarks
210216 for in_order_queue in [0 , 1 ]:
211- benches .append (
212- GraphApiSubmitGraph (
213- self ,
214- runtime ,
215- in_order_queue ,
216- self .submit_graph_num_kernels [- 1 ],
217- 0 ,
218- useEvents = 0 ,
219- useHostTasks = 1 ,
217+ for profiler_type in profiler_types :
218+ benches .append (
219+ GraphApiSubmitGraph (
220+ self ,
221+ runtime ,
222+ in_order_queue ,
223+ self .submit_graph_num_kernels [- 1 ],
224+ 0 ,
225+ profiler_type ,
226+ useEvents = 0 ,
227+ useHostTasks = 1 ,
228+ )
220229 )
221- )
222230 for num_kernels in self .submit_graph_num_kernels :
223231 for measure_completion_time in [0 , 1 ]:
224232 for use_events in [0 , 1 ]:
225- benches .append (
226- GraphApiSubmitGraph (
227- self ,
228- runtime ,
229- in_order_queue ,
230- num_kernels ,
231- measure_completion_time ,
232- use_events ,
233- useHostTasks = 0 ,
233+ for profiler_type in profiler_types :
234+ benches .append (
235+ GraphApiSubmitGraph (
236+ self ,
237+ runtime ,
238+ in_order_queue ,
239+ num_kernels ,
240+ measure_completion_time ,
241+ profiler_type ,
242+ use_events ,
243+ useHostTasks = 0 ,
244+ )
234245 )
235- )
236246
237247 # Add other benchmarks
238248 benches += [
239- QueueInOrderMemcpy (self , 0 , "Device" , "Device" , 1024 ),
240- QueueInOrderMemcpy (self , 0 , "Host" , "Device" , 1024 ),
241- QueueMemcpy (self , "Device" , "Device" , 1024 ),
242249 StreamMemory (self , "Triad" , 10 * 1024 , "Device" ),
243- ExecImmediateCopyQueue (self , 0 , 1 , "Device" , "Device" , 1024 ),
244- ExecImmediateCopyQueue (self , 1 , 1 , "Device" , "Host" , 1024 ),
245250 VectorSum (self ),
246251 GraphApiFinalizeGraph (self , RUNTIMES .SYCL , 0 , "Gromacs" ),
247252 GraphApiFinalizeGraph (self , RUNTIMES .SYCL , 1 , "Gromacs" ),
248253 GraphApiFinalizeGraph (self , RUNTIMES .SYCL , 0 , "Llama" ),
249254 GraphApiFinalizeGraph (self , RUNTIMES .SYCL , 1 , "Llama" ),
250255 ]
256+ for profiler_type in profiler_types :
257+ benches .append (QueueInOrderMemcpy (self , 0 , "Device" , "Device" , 1024 , profiler_type ))
258+ benches .append (QueueInOrderMemcpy (self , 0 , "Host" , "Device" , 1024 , profiler_type ))
259+ benches .append (QueueMemcpy (self , "Device" , "Device" , 1024 , profiler_type ))
260+ benches .append (ExecImmediateCopyQueue (self , 0 , 1 , "Device" , "Device" , 1024 , profiler_type ))
261+ benches .append (ExecImmediateCopyQueue (self , 1 , 1 , "Device" , "Host" , 1024 , profiler_type ))
251262
252263 # Add UR-specific benchmarks
253264 benches += [
@@ -295,12 +306,13 @@ def parse_unit_type(compute_unit):
295306
296307
297308class ComputeBenchmark (Benchmark ):
def __init__(
    self,
    bench,
    name,
    test,
    runtime: RUNTIMES = None,
    profiler_type: str = "",
):
    """Record the identity and configuration of one compute benchmark.

    Args:
        bench: Owning suite object; its ``directory`` attribute and the
            object itself are forwarded to the base ``Benchmark``.
        name: Stored as ``self.bench_name``.
        test: Stored as ``self.test``.
        runtime: Runtime this benchmark targets, or ``None`` when the
            subclass does not pass one.
        profiler_type: Profiler selector stored on the instance; defaults
            to an empty string when the subclass does not supply one.
    """
    super().__init__(bench.directory, bench)
    self.bench = bench
    self.bench_name = name
    self.test = test
    self.runtime = runtime
    self.profiler_type = profiler_type
304316
305317 def supported_runtimes (self ) -> list [RUNTIMES ]:
306318 """Base runtimes supported by this benchmark, can be overridden."""
@@ -428,14 +440,15 @@ def __init__(
428440 MeasureCompletion = 0 ,
429441 UseEvents = 0 ,
430442 KernelExecTime = 1 ,
443+ profiler_type = ""
431444 ):
432445 self .ioq = ioq
433446 self .MeasureCompletion = MeasureCompletion
434447 self .UseEvents = UseEvents
435448 self .KernelExecTime = KernelExecTime
436449 self .NumKernels = 10
437450 super ().__init__ (
438- bench , f"api_overhead_benchmark_{ runtime .value } " , "SubmitKernel" , runtime
451+ bench , f"api_overhead_benchmark_{ runtime .value } " , "SubmitKernel" , runtime , profiler_type
439452 )
440453
441454 def supported_runtimes (self ) -> list [RUNTIMES ]:
@@ -504,7 +517,7 @@ def range(self) -> tuple[float, float]:
504517 return (0.0 , None )
505518
506519 def bin_args (self ) -> list [str ]:
507- return [
520+ bin_args = [
508521 f"--Ioq={ self .ioq } " ,
509522 f"--MeasureCompletion={ self .MeasureCompletion } " ,
510523 "--iterations=100000" ,
@@ -513,6 +526,9 @@ def bin_args(self) -> list[str]:
513526 f"--KernelExecTime={ self .KernelExecTime } " ,
514527 f"--UseEvents={ self .UseEvents } " ,
515528 ]
529+ if self .runtime == RUNTIMES .SYCL :
530+ bin_args .append (f"--profilerType={ self .profiler_type } " )
531+ return bin_args
516532
517533 def get_metadata (self ) -> dict [str , BenchmarkMetadata ]:
518534 metadata_dict = super ().get_metadata ()
@@ -532,13 +548,13 @@ def get_metadata(self) -> dict[str, BenchmarkMetadata]:
532548
533549
534550class ExecImmediateCopyQueue (ComputeBenchmark ):
def __init__(self, bench, ioq, isCopyOnly, source, destination, size, profiler_type):
    """Configure an ExecImmediateCopyQueue benchmark.

    Args:
        bench: Owning suite object, forwarded to the base class.
        ioq: Queue-ordering flag stored as ``self.ioq``.
        isCopyOnly: Stored as ``self.isCopyOnly``.
        source: Stored as ``self.source``.
        destination: Stored as ``self.destination``.
        size: Stored as ``self.size``.
        profiler_type: Profiler selector forwarded to the base class.
    """
    # Instance fields are assigned before the base initializer runs,
    # preserving the original statement order.
    self.ioq = ioq
    self.isCopyOnly = isCopyOnly
    self.source = source
    self.destination = destination
    self.size = size
    # No runtime is passed, so the base class default applies.
    super().__init__(
        bench,
        "api_overhead_benchmark_sycl",
        "ExecImmediateCopyQueue",
        profiler_type=profiler_type,
    )
542558
543559 def name (self ):
544560 order = "in order" if self .ioq else "out of order"
@@ -569,16 +585,17 @@ def bin_args(self) -> list[str]:
569585 f"--dst={ self .destination } " ,
570586 f"--size={ self .size } " ,
571587 "--withCopyOffload=0" ,
588+ f"--profilerType={ self .profiler_type } " ,
572589 ]
573590
574591
575592class QueueInOrderMemcpy (ComputeBenchmark ):
def __init__(self, bench, isCopyOnly, source, destination, size, profiler_type):
    """Configure a QueueInOrderMemcpy benchmark.

    Args:
        bench: Owning suite object, forwarded to the base class.
        isCopyOnly: Stored as ``self.isCopyOnly``.
        source: Stored as ``self.source``.
        destination: Stored as ``self.destination``.
        size: Stored as ``self.size``.
        profiler_type: Profiler selector forwarded to the base class.
    """
    # Instance fields are assigned before the base initializer runs,
    # preserving the original statement order.
    self.isCopyOnly = isCopyOnly
    self.source = source
    self.destination = destination
    self.size = size
    # No runtime is passed, so the base class default applies.
    super().__init__(
        bench,
        "memory_benchmark_sycl",
        "QueueInOrderMemcpy",
        profiler_type=profiler_type,
    )
582599
583600 def name (self ):
584601 return f"memory_benchmark_sycl QueueInOrderMemcpy from { self .source } to { self .destination } , size { self .size } "
@@ -605,15 +622,16 @@ def bin_args(self) -> list[str]:
605622 f"--size={ self .size } " ,
606623 "--count=100" ,
607624 "--withCopyOffload=0" ,
625+ f"--profilerType={ self .profiler_type } "
608626 ]
609627
610628
611629class QueueMemcpy (ComputeBenchmark ):
def __init__(self, bench, source, destination, size, profiler_type):
    """Configure a QueueMemcpy benchmark.

    Args:
        bench: Owning suite object, forwarded to the base class.
        source: Stored as ``self.source``.
        destination: Stored as ``self.destination``.
        size: Stored as ``self.size``.
        profiler_type: Profiler selector forwarded to the base class.
    """
    # Instance fields are assigned before the base initializer runs,
    # preserving the original statement order.
    self.source = source
    self.destination = destination
    self.size = size
    # No runtime is passed, so the base class default applies.
    super().__init__(
        bench,
        "memory_benchmark_sycl",
        "QueueMemcpy",
        profiler_type=profiler_type,
    )
617635
618636 def name (self ):
619637 return f"memory_benchmark_sycl QueueMemcpy from { self .source } to { self .destination } , size { self .size } "
@@ -636,6 +654,7 @@ def bin_args(self) -> list[str]:
636654 f"--sourcePlacement={ self .source } " ,
637655 f"--destinationPlacement={ self .destination } " ,
638656 f"--size={ self .size } " ,
657+ f"--profilerType={ self .profiler_type } "
639658 ]
640659
641660
@@ -858,6 +877,7 @@ def __init__(
858877 inOrderQueue ,
859878 numKernels ,
860879 measureCompletionTime ,
880+ profiler_type ,
861881 useEvents ,
862882 useHostTasks ,
863883 ):
@@ -873,7 +893,7 @@ def __init__(
873893 self .use_events_str = f" with events" if self .useEvents else ""
874894 self .host_tasks_str = f" use host tasks" if self .useHostTasks else ""
875895 super ().__init__ (
876- bench , f"graph_api_benchmark_{ runtime .value } " , "SubmitGraph" , runtime
896+ bench , f"graph_api_benchmark_{ runtime .value } " , "SubmitGraph" , runtime , profiler_type
877897 )
878898
879899 def explicit_group (self ):
@@ -901,7 +921,7 @@ def get_tags(self):
901921 ]
902922
903923 def bin_args (self ) -> list [str ]:
904- return [
924+ bin_args = [
905925 "--iterations=10000" ,
906926 f"--NumKernels={ self .numKernels } " ,
907927 f"--MeasureCompletionTime={ self .measureCompletionTime } " ,
@@ -912,14 +932,17 @@ def bin_args(self) -> list[str]:
912932 "--UseExplicit=0" ,
913933 f"--UseHostTasks={ self .useHostTasks } " ,
914934 ]
935+ if self .runtime == RUNTIMES .SYCL :
936+ bin_args .append (f"--profilerType={ self .profiler_type } " )
937+ return bin_args
915938
916939
917940class UllsEmptyKernel (ComputeBenchmark ):
def __init__(self, bench, runtime: RUNTIMES, wgc, wgs, profiler_type):
    """Configure a ULLS EmptyKernel benchmark for one runtime.

    Args:
        bench: Owning suite object, forwarded to the base class.
        runtime: Target runtime; its ``value`` is used as the suffix of
            the benchmark binary name.
        wgc: Stored as ``self.wgc`` (presumably work-group count --
            confirm against the benchmark binary).
        wgs: Stored as ``self.wgs`` (presumably work-group size --
            confirm against the benchmark binary).
        profiler_type: Profiler selector forwarded to the base class.
    """
    # Instance fields are assigned before the base initializer runs,
    # preserving the original statement order.
    self.wgc = wgc
    self.wgs = wgs
    super().__init__(
        bench,
        f"ulls_benchmark_{runtime.value}",
        "EmptyKernel",
        runtime,
        profiler_type,
    )
924947
925948 def supported_runtimes (self ) -> list [RUNTIMES ]:
@@ -943,11 +966,14 @@ def get_tags(self):
943966 return [runtime_to_tag_name (self .runtime ), "micro" , "latency" , "submit" ]
944967
def bin_args(self) -> list[str]:
    """Build the command-line arguments for the EmptyKernel binary.

    Returns:
        A list containing the fixed iteration count and the ``--wgs`` /
        ``--wgc`` values, plus ``--profilerType`` when this benchmark
        targets the SYCL runtime.
    """
    bin_args = [
        "--iterations=10000",
        f"--wgs={self.wgs}",
        f"--wgc={self.wgc}",
    ]
    # The profiler selector is only passed to the SYCL variant; other
    # runtimes get the argument list without it.
    if self.runtime == RUNTIMES.SYCL:
        bin_args.append(f"--profilerType={self.profiler_type}")
    return bin_args
951977
952978
953979class UllsKernelSwitch (ComputeBenchmark ):
0 commit comments