@@ -115,37 +115,11 @@ def additional_metadata(self) -> dict[str, BenchmarkMetadata]:
115115 ),
116116 }
117117
def enabled_runtimes(self, supported_runtimes=None, extra_runtimes=None):
    """Return the runtimes to generate benchmarks for under the current options.

    Args:
        supported_runtimes: Base collection of runtimes to start from. When
            it is ``None`` or empty, every member of ``RUNTIMES`` is used.
        extra_runtimes: Runtimes appended after ``SYCL_PREVIEW`` has been
            stripped from the base collection, which is how a caller opts
            back in to ``SYCL_PREVIEW``.

    Returns:
        A list of runtimes. ``RUNTIMES.UR`` is dropped when ``options.ur`` is
        ``None`` and ``RUNTIMES.LEVEL_ZERO`` is dropped when
        ``options.ur_adapter`` is ``"cuda"``; both filters also apply to the
        entries that came from ``extra_runtimes``.
    """
    base = supported_runtimes if supported_runtimes else list(RUNTIMES)

    # SYCL_PREVIEW is not supported by default in all benchmarks.
    selected = [rt for rt in base if rt != RUNTIMES.SYCL_PREVIEW]
    if extra_runtimes is not None:
        selected += extra_runtimes

    # Collect everything the current configuration rules out, then filter once.
    excluded = []
    if options.ur is None:
        excluded.append(RUNTIMES.UR)
    if options.ur_adapter == "cuda":
        excluded.append(RUNTIMES.LEVEL_ZERO)

    return [rt for rt in selected if rt not in excluded]
137-
138118 def benchmarks (self ) -> list [Benchmark ]:
139- if options .sycl is None :
140- return []
141-
142- if options .ur_adapter == "hip" :
143- return []
144-
145119 benches = []
146120
147- # Add SubmitKernel benchmarks using loops
148- for runtime in self . enabled_runtimes ( extra_runtimes = [ RUNTIMES . SYCL_PREVIEW ]):
121+ for runtime in list ( RUNTIMES ):
122+ # Add SubmitKernel benchmarks using loops
149123 for in_order_queue in [0 , 1 ]:
150124 for measure_completion in [0 , 1 ]:
151125 for use_events in [0 , 1 ]:
@@ -161,21 +135,18 @@ def benchmarks(self) -> list[Benchmark]:
161135 )
162136 )
163137
164- # Add SinKernelGraph benchmarks
165- for runtime in self .enabled_runtimes ():
138+ # Add SinKernelGraph benchmarks
166139 for with_graphs in [0 , 1 ]:
167140 for num_kernels in [5 , 100 ]:
168141 benches .append (
169142 GraphApiSinKernelGraph (self , runtime , with_graphs , num_kernels )
170143 )
171144
172- # Add ULLS benchmarks
173- for runtime in self .enabled_runtimes ([RUNTIMES .SYCL , RUNTIMES .LEVEL_ZERO ]):
145+ # Add ULLS benchmarks
174146 benches .append (UllsEmptyKernel (self , runtime , 1000 , 256 ))
175147 benches .append (UllsKernelSwitch (self , runtime , 8 , 200 , 0 , 0 , 1 , 1 ))
176148
177- # Add GraphApiSubmitGraph benchmarks
178- for runtime in self .enabled_runtimes ():
149+ # Add GraphApiSubmitGraph benchmarks
179150 for in_order_queue in [0 , 1 ]:
180151 for num_kernels in [4 , 10 , 32 ]:
181152 for measure_completion_time in [0 , 1 ]:
@@ -201,24 +172,24 @@ def benchmarks(self) -> list[Benchmark]:
201172 ]
202173
203174 # Add UR-specific benchmarks
204- if options . ur is not None :
205- benches += [
206- MemcpyExecute (self , RUNTIMES .UR , 400 , 1 , 102400 , 10 , 1 , 1 , 1 , 1 , 0 ),
207- MemcpyExecute (self , RUNTIMES .UR , 400 , 1 , 102400 , 10 , 0 , 1 , 1 , 1 , 0 ),
208- MemcpyExecute (self , RUNTIMES .UR , 100 , 4 , 102400 , 10 , 1 , 1 , 0 , 1 , 0 ),
209- MemcpyExecute (self , RUNTIMES .UR , 100 , 4 , 102400 , 10 , 1 , 1 , 0 , 0 , 0 ),
210- MemcpyExecute (self , RUNTIMES .UR , 4096 , 4 , 1024 , 10 , 0 , 1 , 0 , 1 , 0 ),
211- MemcpyExecute (self , RUNTIMES .UR , 4096 , 4 , 1024 , 10 , 0 , 1 , 0 , 1 , 1 ),
212- UsmMemoryAllocation (self , RUNTIMES .UR , "Device" , 256 , "Both" ),
213- UsmMemoryAllocation (self , RUNTIMES .UR , "Device" , 256 * 1024 , "Both" ),
214- UsmBatchMemoryAllocation (self , RUNTIMES . UR , "Device" , 128 , 256 , "Both" ),
215- UsmBatchMemoryAllocation (
216- self , RUNTIMES . UR , "Device" , 128 , 16 * 1024 , "Both"
217- ),
218- UsmBatchMemoryAllocation (
219- self , RUNTIMES . UR , "Device" , 128 , 128 * 1024 , "Both"
220- ),
221- ]
175+ benches += [
176+ MemcpyExecute ( self , RUNTIMES . UR , 400 , 1 , 102400 , 10 , 1 , 1 , 1 , 1 , 0 ),
177+ MemcpyExecute (self , RUNTIMES .UR , 400 , 1 , 102400 , 10 , 0 , 1 , 1 , 1 , 0 ),
178+ MemcpyExecute (self , RUNTIMES .UR , 100 , 4 , 102400 , 10 , 1 , 1 , 0 , 1 , 0 ),
179+ MemcpyExecute (self , RUNTIMES .UR , 100 , 4 , 102400 , 10 , 1 , 1 , 0 , 0 , 0 ),
180+ MemcpyExecute (self , RUNTIMES .UR , 4096 , 4 , 1024 , 10 , 0 , 1 , 0 , 1 , 0 ),
181+ MemcpyExecute (self , RUNTIMES .UR , 4096 , 4 , 1024 , 10 , 0 , 1 , 0 , 1 , 1 ),
182+ UsmMemoryAllocation (self , RUNTIMES .UR , "Device" , 256 , "Both" ),
183+ UsmMemoryAllocation (self , RUNTIMES .UR , "Device" , 256 * 1024 , "Both" ),
184+ UsmBatchMemoryAllocation (self , RUNTIMES .UR , "Device" , 128 , 256 , "Both" ),
185+ UsmBatchMemoryAllocation (
186+ self , RUNTIMES . UR , "Device" , 128 , 16 * 1024 , "Both"
187+ ),
188+ UsmBatchMemoryAllocation (
189+ self , RUNTIMES . UR , "Device" , 128 , 128 * 1024 , "Both"
190+ ),
191+ ]
192+
222193 benches += [
223194 MemcpyExecute (
224195 self , RUNTIMES .SYCL_PREVIEW , 4096 , 1 , 1024 , 40 , 1 , 1 , 0 , 1 , 0
@@ -246,11 +217,44 @@ def parse_unit_type(compute_unit):
246217
247218
248219class ComputeBenchmark (Benchmark ):
def __init__(self, bench, name, test, runtime: "RUNTIMES | None" = None):
    """Initialise the state shared by all compute benchmarks.

    Args:
        bench: Owning suite object. Its ``directory`` attribute and the
            object itself are forwarded to the base-class constructor.
        name: Stored as ``self.bench_name``.
        test: Stored as ``self.test``.
        runtime: Runtime this benchmark instance targets, stored as
            ``self.runtime``. Defaults to ``None`` (no runtime recorded).
    """
    super().__init__(bench.directory, bench)
    self.bench = bench
    self.bench_name = name
    self.test = test
    self.runtime = runtime
226+
def supported_runtimes(self) -> list[RUNTIMES]:
    """Return the runtimes this benchmark supports, before configuration filtering.

    The default is every member of ``RUNTIMES`` except ``SYCL_PREVIEW``.
    Subclasses can override this method to change the set.
    """
    supported = []
    for candidate in RUNTIMES:
        if candidate == RUNTIMES.SYCL_PREVIEW:
            continue
        supported.append(candidate)
    return supported
231+
def enabled_runtimes(self) -> list[RUNTIMES]:
    """Return the supported runtimes that are usable with the current options.

    Starts from ``self.supported_runtimes()`` and removes ``RUNTIMES.UR``
    when ``options.ur`` is ``None`` and ``RUNTIMES.LEVEL_ZERO`` when
    ``options.ur_adapter`` is ``"cuda"``.
    """
    candidates = self.supported_runtimes()

    # Gather the runtimes ruled out by the configuration, then filter once.
    unavailable = []
    if options.ur is None:
        unavailable.append(RUNTIMES.UR)
    if options.ur_adapter == "cuda":
        unavailable.append(RUNTIMES.LEVEL_ZERO)

    return [rt for rt in candidates if rt not in unavailable]
246+
def enabled(self) -> bool:
    """Tell whether this benchmark should run under the current options.

    Returns ``False`` when ``options.sycl`` is ``None`` or when
    ``options.ur_adapter`` is ``"hip"``. Otherwise returns ``True`` if the
    benchmark has no runtime (``self.runtime is None``) or its runtime is
    listed by ``self.enabled_runtimes()``.
    """
    # SYCL is required for all benchmarks, and the HIP adapter is not supported.
    if options.sycl is None or options.ur_adapter == "hip":
        return False

    # A benchmark with no specific runtime has nothing further to check.
    if self.runtime is None:
        return True

    return self.runtime in self.enabled_runtimes()
254258
255259 def bin_args (self ) -> list [str ]:
256260 return []
@@ -338,15 +342,17 @@ def __init__(
338342 KernelExecTime = 1 ,
339343 ):
340344 self .ioq = ioq
341- self .runtime = runtime
342345 self .MeasureCompletion = MeasureCompletion
343346 self .UseEvents = UseEvents
344347 self .KernelExecTime = KernelExecTime
345348 self .NumKernels = 10
346349 super ().__init__ (
347- bench , f"api_overhead_benchmark_{ runtime .value } " , "SubmitKernel"
350+ bench , f"api_overhead_benchmark_{ runtime .value } " , "SubmitKernel" , runtime
348351 )
349352
def supported_runtimes(self) -> list[RUNTIMES]:
    """Return the inherited supported runtimes plus ``RUNTIMES.SYCL_PREVIEW``."""
    runtimes = list(super().supported_runtimes())
    runtimes.append(RUNTIMES.SYCL_PREVIEW)
    return runtimes
355+
350356 def get_tags (self ):
351357 return ["submit" , "latency" , runtime_to_tag_name (self .runtime ), "micro" ]
352358
@@ -619,7 +625,6 @@ def __init__(
619625 useCopyOffload ,
620626 useBarrier ,
621627 ):
622- self .runtime = runtime
623628 self .numOpsPerThread = numOpsPerThread
624629 self .numThreads = numThreads
625630 self .allocSize = allocSize
@@ -630,7 +635,7 @@ def __init__(
630635 self .useCopyOffload = useCopyOffload
631636 self .useBarrier = useBarrier
632637 super ().__init__ (
633- bench , f"multithread_benchmark_{ self . runtime .value } " , "MemcpyExecute"
638+ bench , f"multithread_benchmark_{ runtime .value } " , "MemcpyExecute" , runtime
634639 )
635640
636641 def extra_env_vars (self ) -> dict :
@@ -706,9 +711,8 @@ class GraphApiSinKernelGraph(ComputeBenchmark):
def __init__(self, bench, runtime: RUNTIMES, withGraphs, numKernels):
    """Record the graph parameters and register the "SinKernelGraph" test.

    ``withGraphs`` and ``numKernels`` are stored on the instance before the
    base constructor runs; ``runtime`` is passed through to the base class
    and its ``value`` is used to build the benchmark name.
    """
    self.numKernels = numKernels
    self.withGraphs = withGraphs
    bench_name = f"graph_api_benchmark_{runtime.value}"
    super().__init__(bench, bench_name, "SinKernelGraph", runtime)
713717
714718 def explicit_group (self ):
@@ -759,9 +763,10 @@ def __init__(
759763 ):
760764 self .inOrderQueue = inOrderQueue
761765 self .numKernels = numKernels
762- self .runtime = runtime
763766 self .measureCompletionTime = measureCompletionTime
764- super ().__init__ (bench , f"graph_api_benchmark_{ runtime .value } " , "SubmitGraph" )
767+ super ().__init__ (
768+ bench , f"graph_api_benchmark_{ runtime .value } " , "SubmitGraph" , runtime
769+ )
765770
766771 def explicit_group (self ):
767772 return f"SubmitGraph, numKernels: { self .numKernels } "
@@ -804,8 +809,12 @@ class UllsEmptyKernel(ComputeBenchmark):
def __init__(self, bench, runtime: RUNTIMES, wgc, wgs):
    """Record ``wgc``/``wgs`` and register the "EmptyKernel" test.

    Both values are stored on the instance before the base constructor
    runs; ``runtime`` is passed through to the base class and its ``value``
    is used to build the benchmark name.
    """
    self.wgs = wgs
    self.wgc = wgc
    bench_name = f"ulls_benchmark_{runtime.value}"
    super().__init__(bench, bench_name, "EmptyKernel", runtime)
815+
def supported_runtimes(self) -> list[RUNTIMES]:
    """Return the only runtimes this benchmark supports: SYCL and Level Zero."""
    sycl_and_l0 = (RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO)
    return list(sycl_and_l0)
809818
810819 def explicit_group (self ):
811820 return f"EmptyKernel, wgc: { self .wgc } , wgs: { self .wgs } "
@@ -849,9 +858,13 @@ def __init__(
849858 self .barrier = barrier
850859 self .hostVisible = hostVisible
851860 self .ctrBasedEvents = ctrBasedEvents
852- self .runtime = runtime
853861 self .ioq = ioq
854- super ().__init__ (bench , f"ulls_benchmark_{ runtime .value } " , "KernelSwitch" )
862+ super ().__init__ (
863+ bench , f"ulls_benchmark_{ runtime .value } " , "KernelSwitch" , runtime
864+ )
865+
def supported_runtimes(self) -> list[RUNTIMES]:
    """Return the only runtimes this benchmark supports: SYCL and Level Zero."""
    return [RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]
855868
856869 def explicit_group (self ):
857870 return f"KernelSwitch, count: { self .count } , kernelTime: { self .kernelTime } "
@@ -884,12 +897,14 @@ class UsmMemoryAllocation(ComputeBenchmark):
def __init__(
    self, bench, runtime: RUNTIMES, usm_memory_placement, size, measure_mode
):
    """Record the allocation parameters and register the "UsmMemoryAllocation" test.

    ``usm_memory_placement``, ``size`` and ``measure_mode`` are stored on
    the instance before the base constructor runs; ``runtime`` is passed
    through to the base class and its ``value`` is used to build the
    benchmark name.
    """
    self.measure_mode = measure_mode
    self.size = size
    self.usm_memory_placement = usm_memory_placement
    bench_name = f"api_overhead_benchmark_{runtime.value}"
    super().__init__(bench, bench_name, "UsmMemoryAllocation", runtime)
894909
895910 def get_tags (self ):
@@ -941,13 +956,15 @@ def __init__(
941956 size ,
942957 measure_mode ,
943958 ):
944- self .runtime = runtime
945959 self .usm_memory_placement = usm_memory_placement
946960 self .allocation_count = allocation_count
947961 self .size = size
948962 self .measure_mode = measure_mode
949963 super ().__init__ (
950- bench , f"api_overhead_benchmark_{ runtime .value } " , "UsmBatchMemoryAllocation"
964+ bench ,
965+ f"api_overhead_benchmark_{ runtime .value } " ,
966+ "UsmBatchMemoryAllocation" ,
967+ runtime ,
951968 )
952969
953970 def get_tags (self ):
0 commit comments