From 17c92530a564d436b5df131288a8d571f218c463 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20Kami=C5=84ski?= Date: Thu, 18 Sep 2025 11:58:44 +0000 Subject: [PATCH 1/2] Run syclpreview benchmarks once Prevent syclpreview benchmarks scenarios from running twice. --- devops/scripts/benchmarks/benches/compute.py | 81 ++++++++++++++------ 1 file changed, 59 insertions(+), 22 deletions(-) diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py index b8adfe7948c94..b0cb3412c6973 100644 --- a/devops/scripts/benchmarks/benches/compute.py +++ b/devops/scripts/benchmarks/benches/compute.py @@ -24,6 +24,11 @@ class RUNTIMES(Enum): UR = "ur" +class PROFILERS(Enum): + TIMER = "timer" + CPU_COUNTER = "cpuCounter" + + def runtime_to_name(runtime: RUNTIMES) -> str: return { RUNTIMES.SYCL_PREVIEW: "SYCL Preview", @@ -171,27 +176,35 @@ def benchmarks(self) -> list[Benchmark]: # hand-picked value so that total execution time of the benchmark is # similar on all architectures - long_lernel_exec_time_ioq = [20] + long_kernel_exec_time_ioq = [20] # For BMG server, a new value 200 is used, but we have to create metadata # for both values to keep the dashboard consistent. # See SubmitKernel.enabled() long_kernel_exec_time_ooo = [20, 200] - # The Combo Profiler is available only for selected sycl benchmarks - profiler_types = ["timer", "cpuCounter"] - for runtime in list(RUNTIMES): # Add SubmitKernel benchmarks using loops for in_order_queue in [0, 1]: for measure_completion in [0, 1]: for use_events in [0, 1]: long_kernel_exec_time = ( - long_lernel_exec_time_ioq + long_kernel_exec_time_ioq if in_order_queue else long_kernel_exec_time_ooo ) for kernel_exec_time in [1, *long_kernel_exec_time]: - for profiler_type in profiler_types: + benches.append( + SubmitKernel( + self, + runtime, + in_order_queue, + measure_completion, + use_events, + kernel_exec_time, + ) + ) + if runtime == RUNTIMES.SYCL: + # Create CPU count variant benches.append( SubmitKernel( self, @@ -200,7 +213,7 @@ def benchmarks(self) -> list[Benchmark]: measure_completion, use_events, kernel_exec_time, - profiler_type, + profiler_type=PROFILERS.CPU_COUNTER, ) ) @@ -212,8 +225,13 @@ def benchmarks(self) -> list[Benchmark]: ) # Add ULLS benchmarks - for profiler_type in profiler_types: - benches.append(UllsEmptyKernel(self, runtime, 1000, 256, profiler_type)) + if runtime == RUNTIMES.SYCL: + benches.append( + UllsEmptyKernel( + self, runtime, 1000, 256, profiler_type=PROFILERS.CPU_COUNTER + ) + ) + benches.append(UllsEmptyKernel(self, runtime, 1000, 256)) benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1)) # Add GraphApiSubmitGraph benchmarks @@ -221,7 +239,19 @@ def benchmarks(self) -> list[Benchmark]: for num_kernels in self.submit_graph_num_kernels: for measure_completion_time in [0, 1]: for use_events in [0, 1]: - for profiler_type in profiler_types: + benches.append( + GraphApiSubmitGraph( + self, + runtime, + in_order_queue, + num_kernels, + measure_completion_time, + use_events, + useHostTasks=0, + ) + ) + if runtime == RUNTIMES.SYCL: + # Create CPU count variant benches.append( GraphApiSubmitGraph( self, @@ -229,9 +259,9 @@ def benchmarks(self) -> list[Benchmark]: in_order_queue, num_kernels, measure_completion_time, - profiler_type, use_events, useHostTasks=0, + profiler_type=PROFILERS.CPU_COUNTER, ) ) @@ -244,7 +274,7 @@ def benchmarks(self) -> list[Benchmark]: GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 0, "Llama"), GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 1, "Llama"), ] - 
for profiler_type in profiler_types: + for profiler_type in list(PROFILERS): benches.append( QueueInOrderMemcpy(self, 0, "Device", "Device", 1024, profiler_type) ) @@ -310,7 +340,12 @@ def parse_unit_type(compute_unit): class ComputeBenchmark(Benchmark): def __init__( - self, bench, name, test, runtime: RUNTIMES = None, profiler_type: str = "" + self, + bench, + name, + test, + runtime: RUNTIMES = None, + profiler_type: PROFILERS = PROFILERS.TIMER, ): super().__init__(bench.directory, bench) self.bench = bench @@ -478,7 +513,7 @@ def __init__( MeasureCompletion=0, UseEvents=0, KernelExecTime=1, - profiler_type="", + profiler_type=PROFILERS.TIMER, ): self.ioq = ioq self.MeasureCompletion = MeasureCompletion @@ -578,7 +613,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]: f"--UseEvents={self.UseEvents}", ] if self.runtime == RUNTIMES.SYCL: - bin_args.append(f"--profilerType={self.profiler_type}") + bin_args.append(f"--profilerType={self.profiler_type.value}") return bin_args def get_metadata(self) -> dict[str, BenchmarkMetadata]: @@ -647,7 +682,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]: f"--dst={self.destination}", f"--size={self.size}", "--withCopyOffload=0", - f"--profilerType={self.profiler_type}", + f"--profilerType={self.profiler_type.value}", ] @@ -693,7 +728,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]: f"--size={self.size}", "--count=100", "--withCopyOffload=0", - f"--profilerType={self.profiler_type}", + f"--profilerType={self.profiler_type.value}", ] @@ -731,7 +766,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]: f"--sourcePlacement={self.source}", f"--destinationPlacement={self.destination}", f"--size={self.size}", - f"--profilerType={self.profiler_type}", + f"--profilerType={self.profiler_type.value}", ] @@ -970,9 +1005,9 @@ def __init__( inOrderQueue, numKernels, measureCompletionTime, - profiler_type, useEvents, useHostTasks, + profiler_type=PROFILERS.TIMER, ): self.inOrderQueue = inOrderQueue self.numKernels = numKernels @@ -1037,12 +1072,14 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]: f"--UseHostTasks={self.useHostTasks}", ] if self.runtime == RUNTIMES.SYCL: - bin_args.append(f"--profilerType={self.profiler_type}") + bin_args.append(f"--profilerType={self.profiler_type.value}") return bin_args class UllsEmptyKernel(ComputeBenchmark): - def __init__(self, bench, runtime: RUNTIMES, wgc, wgs, profiler_type): + def __init__( + self, bench, runtime: RUNTIMES, wgc, wgs, profiler_type=PROFILERS.TIMER + ): self.wgc = wgc self.wgs = wgs # iterations per bin_args: --iterations=10000 @@ -1084,7 +1121,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]: f"--wgc={self.wgc}", ] if self.runtime == RUNTIMES.SYCL: - bin_args.append(f"--profilerType={self.profiler_type}") + bin_args.append(f"--profilerType={self.profiler_type.value}") return bin_args From dfb4e996c470a7029a3fe7c76a83db2e75297e9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20Kami=C5=84ski?= Date: Thu, 18 Sep 2025 13:42:27 +0000 Subject: [PATCH 2/2] Refactor nested loops in ComputeBench benches instantiation --- devops/scripts/benchmarks/benches/compute.py | 161 +++++++++++-------- 1 file changed, 92 insertions(+), 69 deletions(-) diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py index b0cb3412c6973..dcb23e392dc35 100644 --- a/devops/scripts/benchmarks/benches/compute.py +++ 
b/devops/scripts/benchmarks/benches/compute.py @@ -3,6 +3,7 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +from itertools import product import os import csv import io @@ -182,49 +183,61 @@ def benchmarks(self) -> list[Benchmark]: # See SubmitKernel.enabled() long_kernel_exec_time_ooo = [20, 200] - for runtime in list(RUNTIMES): - # Add SubmitKernel benchmarks using loops - for in_order_queue in [0, 1]: - for measure_completion in [0, 1]: - for use_events in [0, 1]: - long_kernel_exec_time = ( - long_kernel_exec_time_ioq - if in_order_queue - else long_kernel_exec_time_ooo - ) - for kernel_exec_time in [1, *long_kernel_exec_time]: - benches.append( - SubmitKernel( - self, - runtime, - in_order_queue, - measure_completion, - use_events, - kernel_exec_time, - ) - ) - if runtime == RUNTIMES.SYCL: - # Create CPU count variant - benches.append( - SubmitKernel( - self, - runtime, - in_order_queue, - measure_completion, - use_events, - kernel_exec_time, - profiler_type=PROFILERS.CPU_COUNTER, - ) - ) - - # Add SinKernelGraph benchmarks - for with_graphs in [0, 1]: - for num_kernels in [5, 100]: + submit_kernel_params = product( + list(RUNTIMES), + [0, 1], # in_order_queue + [0, 1], # measure_completion + [0, 1], # use_events + ) + for ( + runtime, + in_order_queue, + measure_completion, + use_events, + ) in submit_kernel_params: + long_kernel_exec_time = ( + long_kernel_exec_time_ioq + if in_order_queue + else long_kernel_exec_time_ooo + ) + for kernel_exec_time in [1, *long_kernel_exec_time]: + benches.append( + SubmitKernel( + self, + runtime, + in_order_queue, + measure_completion, + use_events, + kernel_exec_time, + ) + ) + if runtime == RUNTIMES.SYCL: + # Create CPU count variant benches.append( - GraphApiSinKernelGraph(self, runtime, with_graphs, num_kernels) + SubmitKernel( + self, + runtime, + in_order_queue, + measure_completion, + use_events, + kernel_exec_time, + profiler_type=PROFILERS.CPU_COUNTER, + ) ) + # Add SinKernelGraph benchmarks + sin_kernel_graph_params = product( + list(RUNTIMES), + [0, 1], # with_graphs + [5, 100], # num_kernels + ) + for runtime, with_graphs, num_kernels in sin_kernel_graph_params: + benches.append( + GraphApiSinKernelGraph(self, runtime, with_graphs, num_kernels) + ) + # Add ULLS benchmarks + for runtime in list(RUNTIMES): if runtime == RUNTIMES.SYCL: benches.append( UllsEmptyKernel( @@ -234,36 +247,46 @@ def benchmarks(self) -> list[Benchmark]: benches.append(UllsEmptyKernel(self, runtime, 1000, 256)) benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1)) - # Add GraphApiSubmitGraph benchmarks - for in_order_queue in [0, 1]: - for num_kernels in self.submit_graph_num_kernels: - for measure_completion_time in [0, 1]: - for use_events in [0, 1]: - benches.append( - GraphApiSubmitGraph( - self, - runtime, - in_order_queue, - num_kernels, - measure_completion_time, - use_events, - useHostTasks=0, - ) - ) - if runtime == RUNTIMES.SYCL: - # Create CPU count variant - benches.append( - GraphApiSubmitGraph( - self, - runtime, - in_order_queue, - num_kernels, - measure_completion_time, - use_events, - useHostTasks=0, - profiler_type=PROFILERS.CPU_COUNTER, - ) - ) + # Add GraphApiSubmitGraph benchmarks + submit_graph_params = product( + list(RUNTIMES), + [0, 1], # in_order_queue + self.submit_graph_num_kernels, + [0, 1], # measure_completion_time + [0, 1], # use_events + ) + for ( + runtime, + in_order_queue, + num_kernels, + measure_completion_time, + use_events, + ) in submit_graph_params: + benches.append( + 
GraphApiSubmitGraph( + self, + runtime, + in_order_queue, + num_kernels, + measure_completion_time, + use_events, + useHostTasks=0, + ) + ) + if runtime == RUNTIMES.SYCL: + # Create CPU count variant + benches.append( + GraphApiSubmitGraph( + self, + runtime, + in_order_queue, + num_kernels, + measure_completion_time, + use_events, + useHostTasks=0, + profiler_type=PROFILERS.CPU_COUNTER, + ) + ) # Add other benchmarks benches += [
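
Note on the PATCH 2/2 refactor: the nested parameter loops are flattened with itertools.product, which yields every combination of the parameter lists as flat tuples. Below is a minimal standalone sketch of that pattern; the Runtime enum values and parameter names are illustrative stand-ins, not copied from compute.py.

from enum import Enum
from itertools import product


class Runtime(Enum):
    # Hypothetical runtime identifiers, standing in for the RUNTIMES enum.
    SYCL = "sycl"
    LEVEL_ZERO = "l0"
    UR = "ur"


# Equivalent to three nested for-loops over runtime, in_order_queue and
# use_events, but without the extra indentation levels.
params = product(
    list(Runtime),
    [0, 1],  # in_order_queue
    [0, 1],  # use_events
)

for runtime, in_order_queue, use_events in params:
    # In compute.py each combination constructs a benchmark object;
    # here we just print it to show the iteration order.
    print(runtime.value, in_order_queue, use_events)

product() iterates with the rightmost argument varying fastest, which matches the nesting order of the original loops, so each benchmark family still sees its parameter combinations in the same order as before the refactor.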