@@ -54,7 +54,7 @@ def git_url(self) -> str:
5454 return "https://github.com/intel/compute-benchmarks.git"
5555
# git_hash(): returns the pinned revision as a string. This hunk replaces the
# previous pin (c9e135d4...) with a new one (49955600...).
# NOTE(review): presumably the hash identifies a commit of the repository
# returned by git_url() directly above (intel/compute-benchmarks) - confirm.
5656 def git_hash (self ) -> str :
57- return "c9e135d4f26dd6badd83009f92f25d6285fc1e21 "
57+ return "4995560017559849a519e58978a0afdd55903e15 "
5858
5959 def setup (self ) -> None :
6060 if options .sycl is None :
@@ -177,6 +177,9 @@ def benchmarks(self) -> list[Benchmark]:
177177 # See SubmitKernel.enabled()
178178 long_kernel_exec_time_ooo = [20 , 200 ]
179179
180+ # The Combo Profiler is available only for selected sycl benchmarks
181+ profiler_types = ["timer" , "cpuCounter" ]
182+
180183 for runtime in list (RUNTIMES ):
181184 # Add SubmitKernel benchmarks using loops
182185 for in_order_queue in [0 , 1 ]:
@@ -188,16 +191,18 @@ def benchmarks(self) -> list[Benchmark]:
188191 else long_kernel_exec_time_ooo
189192 )
190193 for kernel_exec_time in [1 , * long_kernel_exec_time ]:
191- benches .append (
192- SubmitKernel (
193- self ,
194- runtime ,
195- in_order_queue ,
196- measure_completion ,
197- use_events ,
198- kernel_exec_time ,
194+ for profiler_type in profiler_types :
195+ benches .append (
196+ SubmitKernel (
197+ self ,
198+ runtime ,
199+ in_order_queue ,
200+ measure_completion ,
201+ use_events ,
202+ kernel_exec_time ,
203+ profiler_type ,
204+ )
199205 )
200- )
201206
202207 # Add SinKernelGraph benchmarks
203208 for with_graphs in [0 , 1 ]:
@@ -207,51 +212,69 @@ def benchmarks(self) -> list[Benchmark]:
207212 )
208213
209214 # Add ULLS benchmarks
210- benches .append (UllsEmptyKernel (self , runtime , 1000 , 256 ))
215+ for profiler_type in profiler_types :
216+ benches .append (UllsEmptyKernel (self , runtime , 1000 , 256 , profiler_type ))
211217 benches .append (UllsKernelSwitch (self , runtime , 8 , 200 , 0 , 0 , 1 , 1 ))
212218
213219 # Add GraphApiSubmitGraph benchmarks
214220 for in_order_queue in [0 , 1 ]:
215- benches .append (
216- GraphApiSubmitGraph (
217- self ,
218- runtime ,
219- in_order_queue ,
220- self .submit_graph_num_kernels [- 1 ],
221- 0 ,
222- useEvents = 0 ,
223- useHostTasks = 1 ,
221+ for profiler_type in profiler_types :
222+ benches .append (
223+ GraphApiSubmitGraph (
224+ self ,
225+ runtime ,
226+ in_order_queue ,
227+ self .submit_graph_num_kernels [- 1 ],
228+ 0 ,
229+ profiler_type ,
230+ useEvents = 0 ,
231+ useHostTasks = 1 ,
232+ )
224233 )
225- )
226234 for num_kernels in self .submit_graph_num_kernels :
227235 for measure_completion_time in [0 , 1 ]:
228236 for use_events in [0 , 1 ]:
229- benches .append (
230- GraphApiSubmitGraph (
231- self ,
232- runtime ,
233- in_order_queue ,
234- num_kernels ,
235- measure_completion_time ,
236- use_events ,
237- useHostTasks = 0 ,
237+ for profiler_type in profiler_types :
238+ benches .append (
239+ GraphApiSubmitGraph (
240+ self ,
241+ runtime ,
242+ in_order_queue ,
243+ num_kernels ,
244+ measure_completion_time ,
245+ profiler_type ,
246+ use_events ,
247+ useHostTasks = 0 ,
248+ )
238249 )
239- )
240250
241251 # Add other benchmarks
242252 benches += [
243- QueueInOrderMemcpy (self , 0 , "Device" , "Device" , 1024 ),
244- QueueInOrderMemcpy (self , 0 , "Host" , "Device" , 1024 ),
245- QueueMemcpy (self , "Device" , "Device" , 1024 ),
246253 StreamMemory (self , "Triad" , 10 * 1024 , "Device" ),
247- ExecImmediateCopyQueue (self , 0 , 1 , "Device" , "Device" , 1024 ),
248- ExecImmediateCopyQueue (self , 1 , 1 , "Device" , "Host" , 1024 ),
249254 VectorSum (self ),
250255 GraphApiFinalizeGraph (self , RUNTIMES .SYCL , 0 , "Gromacs" ),
251256 GraphApiFinalizeGraph (self , RUNTIMES .SYCL , 1 , "Gromacs" ),
252257 GraphApiFinalizeGraph (self , RUNTIMES .SYCL , 0 , "Llama" ),
253258 GraphApiFinalizeGraph (self , RUNTIMES .SYCL , 1 , "Llama" ),
254259 ]
260+ for profiler_type in profiler_types :
261+ benches .append (
262+ QueueInOrderMemcpy (self , 0 , "Device" , "Device" , 1024 , profiler_type )
263+ )
264+ benches .append (
265+ QueueInOrderMemcpy (self , 0 , "Host" , "Device" , 1024 , profiler_type )
266+ )
267+ benches .append (QueueMemcpy (self , "Device" , "Device" , 1024 , profiler_type ))
268+ benches .append (
269+ ExecImmediateCopyQueue (
270+ self , 0 , 1 , "Device" , "Device" , 1024 , profiler_type
271+ )
272+ )
273+ benches .append (
274+ ExecImmediateCopyQueue (
275+ self , 1 , 1 , "Device" , "Host" , 1024 , profiler_type
276+ )
277+ )
255278
256279 # Add UR-specific benchmarks
257280 benches += [
@@ -299,12 +322,15 @@ def parse_unit_type(compute_unit):
299322
300323
301324class ComputeBenchmark (Benchmark ):
302- def __init__ (self , bench , name , test , runtime : RUNTIMES = None ):
325+ def __init__ (
326+ self , bench , name , test , runtime : RUNTIMES = None , profiler_type : str = ""
327+ ):
303328 super ().__init__ (bench .directory , bench )
304329 self .bench = bench
305330 self .bench_name = name
306331 self .test = test
307332 self .runtime = runtime
333+ self .profiler_type = profiler_type
308334 # Mandatory per-benchmark iteration counts.
309335 # Subclasses MUST set both `self.iterations_regular` and
310336 # `self.iterations_trace` (positive ints) in their __init__ before
@@ -465,6 +491,7 @@ def __init__(
465491 MeasureCompletion = 0 ,
466492 UseEvents = 0 ,
467493 KernelExecTime = 1 ,
494+ profiler_type = "" ,
468495 ):
469496 self .ioq = ioq
470497 self .MeasureCompletion = MeasureCompletion
@@ -475,7 +502,11 @@ def __init__(
475502 self .iterations_regular = 100000
476503 self .iterations_trace = 10
477504 super ().__init__ (
478- bench , f"api_overhead_benchmark_{ runtime .value } " , "SubmitKernel" , runtime
505+ bench ,
506+ f"api_overhead_benchmark_{ runtime .value } " ,
507+ "SubmitKernel" ,
508+ runtime ,
509+ profiler_type ,
479510 )
480511
481512 def supported_runtimes (self ) -> list [RUNTIMES ]:
@@ -486,9 +517,14 @@ def enabled(self) -> bool:
486517 # The benchmark instance gets created just to make metadata for these old results
487518 if not super ().enabled ():
488519 return False
489- if "bmg" in options .device_architecture and self .KernelExecTime == 20 :
520+
521+ device_arch = getattr (options , "device_architecture" , "" )
522+ if "bmg" in device_arch and self .KernelExecTime == 20 :
490523 # Disable this benchmark for BMG server, just create metadata
491524 return False
525+ if "bmg" not in device_arch and self .KernelExecTime == 200 :
526+ # Disable KernelExecTime=200 for non-BMG systems, just create metadata
527+ return False
492528 return True
493529
494530 def get_tags (self ):
@@ -545,7 +581,7 @@ def range(self) -> tuple[float, float]:
545581
546582 def bin_args (self , run_trace : TracingType = TracingType .NONE ) -> list [str ]:
547583 iters = self .get_iters (run_trace )
548- return [
584+ bin_args = [
549585 f"--iterations={ iters } " ,
550586 f"--Ioq={ self .ioq } " ,
551587 f"--MeasureCompletion={ self .MeasureCompletion } " ,
@@ -554,6 +590,9 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
554590 f"--KernelExecTime={ self .KernelExecTime } " ,
555591 f"--UseEvents={ self .UseEvents } " ,
556592 ]
593+ if self .runtime == RUNTIMES .SYCL :
594+ bin_args .append (f"--profilerType={ self .profiler_type } " )
595+ return bin_args
557596
558597 def get_metadata (self ) -> dict [str , BenchmarkMetadata ]:
559598 metadata_dict = super ().get_metadata ()
@@ -573,7 +612,9 @@ def get_metadata(self) -> dict[str, BenchmarkMetadata]:
573612
574613
575614class ExecImmediateCopyQueue (ComputeBenchmark ):
576- def __init__ (self , bench , ioq , isCopyOnly , source , destination , size ):
615+ def __init__ (
616+ self , bench , ioq , isCopyOnly , source , destination , size , profiler_type
617+ ):
577618 self .ioq = ioq
578619 self .isCopyOnly = isCopyOnly
579620 self .source = source
@@ -582,7 +623,12 @@ def __init__(self, bench, ioq, isCopyOnly, source, destination, size):
582623 # iterations per bin_args: --iterations=100000
583624 self .iterations_regular = 100000
584625 self .iterations_trace = 10
585- super ().__init__ (bench , "api_overhead_benchmark_sycl" , "ExecImmediateCopyQueue" )
626+ super ().__init__ (
627+ bench ,
628+ "api_overhead_benchmark_sycl" ,
629+ "ExecImmediateCopyQueue" ,
630+ profiler_type = profiler_type ,
631+ )
586632
587633 def name (self ):
588634 order = "in order" if self .ioq else "out of order"
@@ -614,19 +660,25 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
614660 f"--dst={ self .destination } " ,
615661 f"--size={ self .size } " ,
616662 "--withCopyOffload=0" ,
663+ f"--profilerType={ self .profiler_type } " ,
617664 ]
618665
619666
620667class QueueInOrderMemcpy (ComputeBenchmark ):
# QueueInOrderMemcpy.__init__: the new signature adds a trailing
# `profiler_type` parameter with no default, so every call site has to pass it.
# The instance fields and the iteration counts are set before the parent
# __init__ is called; that ordering is unchanged by this hunk.
# `profiler_type` is forwarded to the parent __init__ by keyword, so no
# positional argument is supplied for anything after the test name.
621- def __init__ (self , bench , isCopyOnly , source , destination , size ):
668+ def __init__ (self , bench , isCopyOnly , source , destination , size , profiler_type ):
622669 self .isCopyOnly = isCopyOnly
623670 self .source = source
624671 self .destination = destination
625672 self .size = size
626673 # iterations per bin_args: --iterations=10000
627674 self .iterations_regular = 10000
628675 self .iterations_trace = 10
629- super ().__init__ (bench , "memory_benchmark_sycl" , "QueueInOrderMemcpy" )
676+ super ().__init__ (
677+ bench ,
678+ "memory_benchmark_sycl" ,
679+ "QueueInOrderMemcpy" ,
680+ profiler_type = profiler_type ,
681+ )
630682
631683 def name (self ):
632684 return f"memory_benchmark_sycl QueueInOrderMemcpy from { self .source } to { self .destination } , size { self .size } "
@@ -654,18 +706,21 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
654706 f"--size={ self .size } " ,
655707 "--count=100" ,
656708 "--withCopyOffload=0" ,
709+ f"--profilerType={ self .profiler_type } " ,
657710 ]
658711
659712
660713class QueueMemcpy (ComputeBenchmark ):
# QueueMemcpy.__init__: gains a trailing `profiler_type` parameter with no
# default, so every call site has to pass it.
# The value is handed to the parent __init__ by keyword; the binary name
# ("memory_benchmark_sycl") and test name ("QueueMemcpy") are unchanged.
661- def __init__ (self , bench , source , destination , size ):
714+ def __init__ (self , bench , source , destination , size , profiler_type ):
662715 self .source = source
663716 self .destination = destination
664717 self .size = size
665718 # iterations per bin_args: --iterations=10000
666719 self .iterations_regular = 10000
667720 self .iterations_trace = 10
668- super ().__init__ (bench , "memory_benchmark_sycl" , "QueueMemcpy" )
721+ super ().__init__ (
722+ bench , "memory_benchmark_sycl" , "QueueMemcpy" , profiler_type = profiler_type
723+ )
669724
670725 def name (self ):
671726 return f"memory_benchmark_sycl QueueMemcpy from { self .source } to { self .destination } , size { self .size } "
@@ -689,6 +744,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
689744 f"--sourcePlacement={ self .source } " ,
690745 f"--destinationPlacement={ self .destination } " ,
691746 f"--size={ self .size } " ,
747+ f"--profilerType={ self .profiler_type } " ,
692748 ]
693749
694750
@@ -927,6 +983,7 @@ def __init__(
927983 inOrderQueue ,
928984 numKernels ,
929985 measureCompletionTime ,
986+ profiler_type ,
930987 useEvents ,
931988 useHostTasks ,
932989 ):
@@ -945,7 +1002,11 @@ def __init__(
9451002 self .iterations_regular = 10000
9461003 self .iterations_trace = 10
9471004 super ().__init__ (
948- bench , f"graph_api_benchmark_{ runtime .value } " , "SubmitGraph" , runtime
1005+ bench ,
1006+ f"graph_api_benchmark_{ runtime .value } " ,
1007+ "SubmitGraph" ,
1008+ runtime ,
1009+ profiler_type ,
9491010 )
9501011
9511012 def explicit_group (self ):
@@ -974,7 +1035,7 @@ def get_tags(self):
9741035
9751036 def bin_args (self , run_trace : TracingType = TracingType .NONE ) -> list [str ]:
9761037 iters = self .get_iters (run_trace )
977- return [
1038+ bin_args = [
9781039 f"--iterations={ iters } " ,
9791040 f"--NumKernels={ self .numKernels } " ,
9801041 f"--MeasureCompletionTime={ self .measureCompletionTime } " ,
@@ -985,17 +1046,24 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
9851046 "--UseExplicit=0" ,
9861047 f"--UseHostTasks={ self .useHostTasks } " ,
9871048 ]
1049+ if self .runtime == RUNTIMES .SYCL :
1050+ bin_args .append (f"--profilerType={ self .profiler_type } " )
1051+ return bin_args
9881052
9891053
9901054class UllsEmptyKernel (ComputeBenchmark ):
# UllsEmptyKernel.__init__: adds a trailing `profiler_type` parameter with no
# default, so every call site has to pass it.
# Unlike the memcpy benchmarks, the value is passed to the parent __init__
# positionally, as the argument immediately after `runtime`.
# The binary name is still derived from the runtime via
# f"ulls_benchmark_{runtime.value}".
991- def __init__ (self , bench , runtime : RUNTIMES , wgc , wgs ):
1055+ def __init__ (self , bench , runtime : RUNTIMES , wgc , wgs , profiler_type ):
9921056 self .wgc = wgc
9931057 self .wgs = wgs
9941058 # iterations per bin_args: --iterations=10000
9951059 self .iterations_regular = 10000
9961060 self .iterations_trace = 10
9971061 super ().__init__ (
998- bench , f"ulls_benchmark_{ runtime .value } " , "EmptyKernel" , runtime
1062+ bench ,
1063+ f"ulls_benchmark_{ runtime .value } " ,
1064+ "EmptyKernel" ,
1065+ runtime ,
1066+ profiler_type ,
9991067 )
10001068
10011069 def supported_runtimes (self ) -> list [RUNTIMES ]:
@@ -1020,11 +1088,14 @@ def get_tags(self):
10201088
# UllsEmptyKernel.bin_args: builds the command-line arguments for the binary.
# The list is now held in a local instead of being returned directly, so that
# `--profilerType=<self.profiler_type>` can be appended afterwards.
# That extra argument is added only when self.runtime == RUNTIMES.SYCL; for
# any other runtime the returned list is the original three arguments.
10211089 def bin_args (self , run_trace : TracingType = TracingType .NONE ) -> list [str ]:
10221090 iters = self .get_iters (run_trace )
1023- return [
1091+ bin_args = [
10241092 f"--iterations={ iters } " ,
10251093 f"--wgs={ self .wgs } " ,
10261094 f"--wgc={ self .wgc } " ,
10271095 ]
1096+ if self .runtime == RUNTIMES .SYCL :
1097+ bin_args .append (f"--profilerType={ self .profiler_type } " )
1098+ return bin_args
10281099
10291100
10301101class UllsKernelSwitch (ComputeBenchmark ):
0 commit comments