diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py
index b8adfe7948c94..dcb23e392dc35 100644
--- a/devops/scripts/benchmarks/benches/compute.py
+++ b/devops/scripts/benchmarks/benches/compute.py
@@ -3,6 +3,7 @@
 # See LICENSE.TXT
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
+from itertools import product
 import os
 import csv
 import io
@@ -24,6 +25,11 @@ class RUNTIMES(Enum):
     UR = "ur"
 
 
+class PROFILERS(Enum):
+    TIMER = "timer"
+    CPU_COUNTER = "cpuCounter"
+
+
 def runtime_to_name(runtime: RUNTIMES) -> str:
     return {
         RUNTIMES.SYCL_PREVIEW: "SYCL Preview",
@@ -171,69 +177,116 @@ def benchmarks(self) -> list[Benchmark]:
 
         # hand-picked value so that total execution time of the benchmark is
         # similar on all architectures
-        long_lernel_exec_time_ioq = [20]
+        long_kernel_exec_time_ioq = [20]
         # For BMG server, a new value 200 is used, but we have to create metadata
         # for both values to keep the dashboard consistent.
        # See SubmitKernel.enabled()
         long_kernel_exec_time_ooo = [20, 200]
 
-        # The Combo Profiler is available only for selected sycl benchmarks
-        profiler_types = ["timer", "cpuCounter"]
-
-        for runtime in list(RUNTIMES):
-            # Add SubmitKernel benchmarks using loops
-            for in_order_queue in [0, 1]:
-                for measure_completion in [0, 1]:
-                    for use_events in [0, 1]:
-                        long_kernel_exec_time = (
-                            long_lernel_exec_time_ioq
-                            if in_order_queue
-                            else long_kernel_exec_time_ooo
-                        )
-                        for kernel_exec_time in [1, *long_kernel_exec_time]:
-                            for profiler_type in profiler_types:
-                                benches.append(
-                                    SubmitKernel(
-                                        self,
-                                        runtime,
-                                        in_order_queue,
-                                        measure_completion,
-                                        use_events,
-                                        kernel_exec_time,
-                                        profiler_type,
-                                    )
-                                )
-
-            # Add SinKernelGraph benchmarks
-            for with_graphs in [0, 1]:
-                for num_kernels in [5, 100]:
+        submit_kernel_params = product(
+            list(RUNTIMES),
+            [0, 1],  # in_order_queue
+            [0, 1],  # measure_completion
+            [0, 1],  # use_events
+        )
+        for (
+            runtime,
+            in_order_queue,
+            measure_completion,
+            use_events,
+        ) in submit_kernel_params:
+            long_kernel_exec_time = (
+                long_kernel_exec_time_ioq
+                if in_order_queue
+                else long_kernel_exec_time_ooo
+            )
+            for kernel_exec_time in [1, *long_kernel_exec_time]:
+                benches.append(
+                    SubmitKernel(
+                        self,
+                        runtime,
+                        in_order_queue,
+                        measure_completion,
+                        use_events,
+                        kernel_exec_time,
+                    )
+                )
+                if runtime == RUNTIMES.SYCL:
+                    # Create CPU count variant
                     benches.append(
-                        GraphApiSinKernelGraph(self, runtime, with_graphs, num_kernels)
+                        SubmitKernel(
+                            self,
+                            runtime,
+                            in_order_queue,
+                            measure_completion,
+                            use_events,
+                            kernel_exec_time,
+                            profiler_type=PROFILERS.CPU_COUNTER,
+                        )
                     )
+
+        # Add SinKernelGraph benchmarks
+        sin_kernel_graph_params = product(
+            list(RUNTIMES),
+            [0, 1],  # with_graphs
+            [5, 100],  # num_kernels
+        )
+        for runtime, with_graphs, num_kernels in sin_kernel_graph_params:
+            benches.append(
+                GraphApiSinKernelGraph(self, runtime, with_graphs, num_kernels)
+            )
 
-            # Add ULLS benchmarks
-            for profiler_type in profiler_types:
-                benches.append(UllsEmptyKernel(self, runtime, 1000, 256, profiler_type))
+        # Add ULLS benchmarks
+        for runtime in list(RUNTIMES):
+            if runtime == RUNTIMES.SYCL:
+                benches.append(
+                    UllsEmptyKernel(
+                        self, runtime, 1000, 256, profiler_type=PROFILERS.CPU_COUNTER
+                    )
+                )
+            benches.append(UllsEmptyKernel(self, runtime, 1000, 256))
             benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1))
 
-            # Add GraphApiSubmitGraph benchmarks
-            for in_order_queue in [0, 1]:
-                for num_kernels in self.submit_graph_num_kernels:
-                    for measure_completion_time in [0, 1]:
-                        for use_events in [0, 1]:
-                            for profiler_type in profiler_types:
-                                benches.append(
-                                    GraphApiSubmitGraph(
-                                        self,
-                                        runtime,
-                                        in_order_queue,
-                                        num_kernels,
-                                        measure_completion_time,
-                                        profiler_type,
-                                        use_events,
-                                        useHostTasks=0,
-                                    )
-                                )
+        # Add GraphApiSubmitGraph benchmarks
+        submit_graph_params = product(
+            list(RUNTIMES),
+            [0, 1],  # in_order_queue
+            self.submit_graph_num_kernels,
+            [0, 1],  # measure_completion_time
+            [0, 1],  # use_events
+        )
+        for (
+            runtime,
+            in_order_queue,
+            num_kernels,
+            measure_completion_time,
+            use_events,
+        ) in submit_graph_params:
+            benches.append(
+                GraphApiSubmitGraph(
+                    self,
+                    runtime,
+                    in_order_queue,
+                    num_kernels,
+                    measure_completion_time,
+                    use_events,
+                    useHostTasks=0,
+                )
+            )
+            if runtime == RUNTIMES.SYCL:
+                # Create CPU count variant
+                benches.append(
+                    GraphApiSubmitGraph(
+                        self,
+                        runtime,
+                        in_order_queue,
+                        num_kernels,
+                        measure_completion_time,
+                        use_events,
+                        useHostTasks=0,
+                        profiler_type=PROFILERS.CPU_COUNTER,
+                    )
+                )
 
         # Add other benchmarks
         benches += [
@@ -244,7 +297,7 @@ def benchmarks(self) -> list[Benchmark]:
             GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 0, "Llama"),
             GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 1, "Llama"),
         ]
-        for profiler_type in profiler_types:
+        for profiler_type in list(PROFILERS):
            benches.append(
                 QueueInOrderMemcpy(self, 0, "Device", "Device", 1024, profiler_type)
             )
@@ -310,7 +363,12 @@ def parse_unit_type(compute_unit):
 
 class ComputeBenchmark(Benchmark):
     def __init__(
-        self, bench, name, test, runtime: RUNTIMES = None, profiler_type: str = ""
+        self,
+        bench,
+        name,
+        test,
+        runtime: RUNTIMES = None,
+        profiler_type: PROFILERS = PROFILERS.TIMER,
     ):
         super().__init__(bench.directory, bench)
         self.bench = bench
@@ -478,7 +536,7 @@ def __init__(
         MeasureCompletion=0,
         UseEvents=0,
         KernelExecTime=1,
-        profiler_type="",
+        profiler_type=PROFILERS.TIMER,
     ):
         self.ioq = ioq
         self.MeasureCompletion = MeasureCompletion
@@ -578,7 +636,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--UseEvents={self.UseEvents}",
         ]
         if self.runtime == RUNTIMES.SYCL:
-            bin_args.append(f"--profilerType={self.profiler_type}")
+            bin_args.append(f"--profilerType={self.profiler_type.value}")
         return bin_args
 
     def get_metadata(self) -> dict[str, BenchmarkMetadata]:
@@ -647,7 +705,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--dst={self.destination}",
             f"--size={self.size}",
             "--withCopyOffload=0",
-            f"--profilerType={self.profiler_type}",
+            f"--profilerType={self.profiler_type.value}",
         ]
 
 
@@ -693,7 +751,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--size={self.size}",
             "--count=100",
             "--withCopyOffload=0",
-            f"--profilerType={self.profiler_type}",
+            f"--profilerType={self.profiler_type.value}",
         ]
 
 
@@ -731,7 +789,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--sourcePlacement={self.source}",
             f"--destinationPlacement={self.destination}",
             f"--size={self.size}",
-            f"--profilerType={self.profiler_type}",
+            f"--profilerType={self.profiler_type.value}",
         ]
 
 
@@ -970,9 +1028,9 @@ def __init__(
         inOrderQueue,
         numKernels,
         measureCompletionTime,
-        profiler_type,
         useEvents,
         useHostTasks,
+        profiler_type=PROFILERS.TIMER,
     ):
         self.inOrderQueue = inOrderQueue
         self.numKernels = numKernels
@@ -1037,12 +1095,14 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--UseHostTasks={self.useHostTasks}",
         ]
         if self.runtime == RUNTIMES.SYCL:
-            bin_args.append(f"--profilerType={self.profiler_type}")
+            bin_args.append(f"--profilerType={self.profiler_type.value}")
         return bin_args
 
 
 class UllsEmptyKernel(ComputeBenchmark):
-    def __init__(self, bench, runtime: RUNTIMES, wgc, wgs, profiler_type):
+    def __init__(
+        self, bench, runtime: RUNTIMES, wgc, wgs, profiler_type=PROFILERS.TIMER
+    ):
         self.wgc = wgc
         self.wgs = wgs
         # iterations per bin_args: --iterations=10000
@@ -1084,7 +1144,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--wgc={self.wgc}",
         ]
         if self.runtime == RUNTIMES.SYCL:
-            bin_args.append(f"--profilerType={self.profiler_type}")
+            bin_args.append(f"--profilerType={self.profiler_type.value}")
         return bin_args
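Below is a minimal, standalone sketch (not part of the patch) of the two ideas the diff applies: flattening the nested parameter loops into a single `itertools.product` call, and generating the `cpuCounter` profiler variant only for the SYCL runtime while passing `--profilerType` via the enum's `.value`. The `RUNTIMES` members shown here are a trimmed stand-in for the ones in `compute.py` (only `"ur"` is confirmed by the diff), and `submit_kernel_variants` / `profiler_bin_arg` are hypothetical helpers for illustration; the real `SubmitKernel` parameter space also includes dimensions such as `kernel_exec_time` that this sketch omits.

```python
from enum import Enum
from itertools import product


class RUNTIMES(Enum):
    # Trimmed stand-in for the RUNTIMES enum in compute.py; only "ur" is
    # confirmed by the diff, the other values are illustrative.
    SYCL = "sycl"
    LEVEL_ZERO = "l0"
    UR = "ur"


class PROFILERS(Enum):
    TIMER = "timer"
    CPU_COUNTER = "cpuCounter"


def submit_kernel_variants() -> list[tuple]:
    """Expand a simplified SubmitKernel parameter space with one product()
    call instead of nested loops, adding a cpuCounter variant only for SYCL."""
    variants = []
    params = product(list(RUNTIMES), [0, 1], [0, 1], [0, 1])
    for runtime, in_order_queue, measure_completion, use_events in params:
        # Every combination gets the default timer-based profiler.
        variants.append(
            (runtime, in_order_queue, measure_completion, use_events, PROFILERS.TIMER)
        )
        if runtime == RUNTIMES.SYCL:
            # The CPU-counter (combo) profiler variant is generated for SYCL only.
            variants.append(
                (
                    runtime,
                    in_order_queue,
                    measure_completion,
                    use_events,
                    PROFILERS.CPU_COUNTER,
                )
            )
    return variants


def profiler_bin_arg(runtime: RUNTIMES, profiler: PROFILERS) -> list[str]:
    # Mirrors the bin_args pattern in the diff: the flag is emitted only for
    # SYCL, and the enum's .value keeps the plain string the binary expects.
    if runtime == RUNTIMES.SYCL:
        return [f"--profilerType={profiler.value}"]
    return []


if __name__ == "__main__":
    variants = submit_kernel_variants()
    print(len(variants))  # 3 runtimes * 8 combos + 8 SYCL-only variants = 32
    print(profiler_bin_arg(RUNTIMES.SYCL, PROFILERS.CPU_COUNTER))
    print(profiler_bin_arg(RUNTIMES.LEVEL_ZERO, PROFILERS.TIMER))
```

The same pattern extends to the GraphApiSubmitGraph parameters in the patch by adding further members to the `product` call and keyword-passing `profiler_type` only for the SYCL-specific variant.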