
Commit fae04f4

cleanup compute bench, fix readme, use newer sycl-bench

1 parent 1041db6

File tree: 5 files changed, +99 / -129 lines

- devops/scripts/benchmarks/README.md
- devops/scripts/benchmarks/benches/base.py
- devops/scripts/benchmarks/benches/compute.py
- devops/scripts/benchmarks/benches/syclbench.py
- devops/scripts/benchmarks/main.py

devops/scripts/benchmarks/README.md

Lines changed: 4 additions & 4 deletions

@@ -6,6 +6,8 @@ Scripts for running performance tests on SYCL and Unified Runtime.
 
 - [Velocity Bench](https://github.com/oneapi-src/Velocity-Bench)
 - [Compute Benchmarks](https://github.com/intel/compute-benchmarks/)
+- [LlamaCpp Benchmarks](https://github.com/ggerganov/llama.cpp)
+- [SYCL-Bench](https://github.com/unisa-hpc/sycl-bench)
 
 ## Running
 
@@ -27,8 +29,6 @@ You can also include additional benchmark parameters, such as environment variab
 
 Once all the required information is entered, click the "Run workflow" button to initiate a new workflow run. This will execute the benchmarks and then post the results as a comment on the specified Pull Request.
 
-By default, all benchmark runs are compared against `baseline`, which is a well-established set of the latest data.
-
 You must be a member of the `oneapi-src` organization to access these features.
 
 ## Comparing results
@@ -37,8 +37,8 @@ By default, the benchmark results are not stored. To store them, use the option
 
 You can compare benchmark results using the `--compare` option. The comparison will be presented in a markdown output file (see below). If you want to calculate the relative performance of the new results against previously saved data, use `--compare <previously_saved_data>` (e.g. `--compare baseline`). When comparing only stored data without generating new results, use `--dry-run --compare <name1> --compare <name2> --relative-perf <name1>`, where `name1` indicates the baseline for the relative performance calculation and `--dry-run` prevents the script from running benchmarks. Listing more than two `--compare` options results in displaying only execution time, without statistical analysis.
 
-Baseline, as well as baseline-v2 (for the level-zero adapter v2) is updated automatically during a nightly job. The results
-are stored [here](https://oneapi-src.github.io/unified-runtime/benchmark_results.html).
+Baseline_L0, as well as Baseline_L0v2 (for the level-zero adapter v2) is updated automatically during a nightly job. The results
+are stored [here](https://oneapi-src.github.io/unified-runtime/performance/).
 
 ## Output formats
 You can display the results as an HTML file by using `--output-html` and as a markdown file by using `--output-markdown`. Due to character limits for posting PR comments, the final content of the markdown file might be reduced. In order to obtain the full markdown output, use `--output-markdown full`.
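As a hedged illustration of the comparison workflow described in this hunk: the flags below come from the README text, while the result names and the assumption that no other arguments are required are placeholders, not part of this commit.

```python
# Illustrative only: invokes the benchmark script with the comparison flags documented
# above. "baseline" matches the README; "my_run" is a hypothetical saved result name,
# and any additional arguments the script may require are omitted here.
import subprocess

subprocess.run(
    [
        "python", "devops/scripts/benchmarks/main.py",
        "--dry-run",                    # compare stored data only, do not run benchmarks
        "--compare", "baseline",
        "--compare", "my_run",
        "--relative-perf", "baseline",  # baseline for the relative-performance calculation
        "--output-markdown", "full",    # full markdown output, not the PR-comment-sized one
    ],
    check=True,
)
```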

devops/scripts/benchmarks/benches/base.py

Lines changed: 5 additions & 8 deletions

@@ -75,12 +75,6 @@ def download(
         self.data_path = self.create_data_path(name, skip_data_dir)
         return download(self.data_path, url, file, untar, unzip, checksum)
 
-    def name(self):
-        raise NotImplementedError()
-
-    def description(self):
-        return "No description provided."
-
     def lower_is_better(self):
         return True
 
@@ -99,8 +93,11 @@ def stddev_threshold(self):
     def get_suite_name(self) -> str:
         return self.suite.name()
 
-    def result_names(self) -> list[str]:
-        return [self.name()]
+    def name(self):
+        raise NotImplementedError()
+
+    def description(self):
+        return "No description provided."
 
     def notes(self) -> str:
         return None

devops/scripts/benchmarks/benches/compute.py

Lines changed: 87 additions & 114 deletions

@@ -13,6 +13,20 @@
 from enum import Enum
 
 
+class RUNTIMES(Enum):
+    SYCL = "sycl"
+    LEVEL_ZERO = "l0"
+    UR = "ur"
+
+
+def runtime_to_name(runtime: RUNTIMES) -> str:
+    return {
+        RUNTIMES.SYCL: "SYCL",
+        RUNTIMES.LEVEL_ZERO: "Level Zero",
+        RUNTIMES.UR: "Unified Runtime",
+    }[runtime]
+
+
 class ComputeBench(Suite):
     def __init__(self, directory):
         self.directory = directory
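The `RUNTIMES` enum now lives at module scope (it previously sat further down in the file, next to `GraphApiSinKernelGraph`), so both the suite and the individual benchmark classes can use it. A minimal, self-contained sketch of how the enum and `runtime_to_name` behave; the definitions are restated from the hunk above rather than imported from the repo:

```python
from enum import Enum

# Mirrors the definitions added in the hunk above; restated here only so the
# snippet runs on its own.
class RUNTIMES(Enum):
    SYCL = "sycl"
    LEVEL_ZERO = "l0"
    UR = "ur"

def runtime_to_name(runtime: RUNTIMES) -> str:
    return {
        RUNTIMES.SYCL: "SYCL",
        RUNTIMES.LEVEL_ZERO: "Level Zero",
        RUNTIMES.UR: "Unified Runtime",
    }[runtime]

# The enum value feeds binary names (e.g. "api_overhead_benchmark_l0"),
# while runtime_to_name() feeds the human-readable descriptions.
assert RUNTIMES.LEVEL_ZERO.value == "l0"
assert runtime_to_name(RUNTIMES.LEVEL_ZERO) == "Level Zero"
```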
@@ -70,64 +84,78 @@ def additionalMetadata(self) -> dict[str, BenchmarkMetadata]:
             ),
         }
 
+    def enabled_runtimes(self, supported_runtimes=None):
+        # all runtimes in the RUNTIMES enum
+        runtimes = supported_runtimes or list(RUNTIMES)
+
+        # Filter out UR if not available
+        if options.ur is None:
+            runtimes = [r for r in runtimes if r != RUNTIMES.UR]
+
+        return runtimes
+
     def benchmarks(self) -> list[Benchmark]:
         if options.sycl is None:
             return []
 
         if options.ur_adapter == "cuda":
             return []
 
-        benches = [
-            SubmitKernelL0(self, 0),
-            SubmitKernelL0(self, 1),
-            SubmitKernelSYCL(self, 0),
-            SubmitKernelSYCL(self, 1),
+        benches = []
+
+        # Add SubmitKernel benchmarks using loops
+        for runtime in self.enabled_runtimes():
+            for in_order_queue in [0, 1]:
+                for measure_completion in [0, 1]:
+                    benches.append(
+                        SubmitKernel(self, runtime, in_order_queue, measure_completion)
+                    )
+
+        # Add SinKernelGraph benchmarks
+        for runtime in self.enabled_runtimes():
+            for with_graphs in [0, 1]:
+                for num_kernels in [5, 100]:
+                    benches.append(
+                        GraphApiSinKernelGraph(self, runtime, with_graphs, num_kernels)
+                    )
+
+        # Add ULLS benchmarks
+        for runtime in self.enabled_runtimes([RUNTIMES.SYCL, RUNTIMES.LEVEL_ZERO]):
+            benches.append(UllsEmptyKernel(self, runtime, 1000, 256))
+            benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1))
+
+        # Add GraphApiSubmitGraph benchmarks
+        for runtime in self.enabled_runtimes([RUNTIMES.SYCL]):
+            for in_order_queue in [0, 1]:
+                for num_kernels in [4, 10, 32]:
+                    for measure_completion_time in [0, 1]:
+                        benches.append(
+                            GraphApiSubmitGraph(
+                                self,
+                                runtime,
+                                in_order_queue,
+                                num_kernels,
+                                measure_completion_time,
+                            )
+                        )
+
+        # Add other benchmarks
+        benches += [
             QueueInOrderMemcpy(self, 0, "Device", "Device", 1024),
             QueueInOrderMemcpy(self, 0, "Host", "Device", 1024),
             QueueMemcpy(self, "Device", "Device", 1024),
             StreamMemory(self, "Triad", 10 * 1024, "Device"),
             ExecImmediateCopyQueue(self, 0, 1, "Device", "Device", 1024),
             ExecImmediateCopyQueue(self, 1, 1, "Device", "Host", 1024),
             VectorSum(self),
-            GraphApiSinKernelGraph(self, RUNTIMES.SYCL, 0, 5),
-            GraphApiSinKernelGraph(self, RUNTIMES.SYCL, 1, 5),
-            GraphApiSinKernelGraph(self, RUNTIMES.SYCL, 0, 100),
-            GraphApiSinKernelGraph(self, RUNTIMES.SYCL, 1, 100),
-            GraphApiSinKernelGraph(self, RUNTIMES.LEVEL_ZERO, 0, 5),
-            GraphApiSinKernelGraph(self, RUNTIMES.LEVEL_ZERO, 1, 5),
-            GraphApiSinKernelGraph(self, RUNTIMES.LEVEL_ZERO, 0, 100),
-            GraphApiSinKernelGraph(self, RUNTIMES.LEVEL_ZERO, 1, 100),
-            UllsEmptyKernel(self, RUNTIMES.SYCL, 1000, 256),
-            UllsEmptyKernel(self, RUNTIMES.LEVEL_ZERO, 1000, 256),
-            UllsKernelSwitch(self, RUNTIMES.SYCL, 8, 200, 0, 0, 1, 1),
-            UllsKernelSwitch(self, RUNTIMES.LEVEL_ZERO, 8, 200, 0, 0, 1, 1),
         ]
 
-        for in_order_queue in [0, 1]:
-            for num_kernels in [4, 32]:
-                for measure_completion_time in [0, 1]:
-                    benches.append(
-                        GraphApiSubmitGraph(
-                            self,
-                            RUNTIMES.SYCL,
-                            in_order_queue,
-                            num_kernels,
-                            measure_completion_time,
-                        )
-                    )
-
+        # Add UR-specific benchmarks
         if options.ur is not None:
             benches += [
-                SubmitKernelUR(self, 0, 0),
-                SubmitKernelUR(self, 1, 0),
-                SubmitKernelUR(self, 1, 1),
                 MemcpyExecute(self, 400, 1, 102400, 10, 1, 1, 1),
                 MemcpyExecute(self, 400, 1, 102400, 10, 0, 1, 1),
                 MemcpyExecute(self, 4096, 4, 1024, 10, 0, 1, 0),
-                GraphApiSinKernelGraph(self, RUNTIMES.UR, 0, 5),
-                GraphApiSinKernelGraph(self, RUNTIMES.UR, 1, 5),
-                GraphApiSinKernelGraph(self, RUNTIMES.UR, 0, 100),
-                GraphApiSinKernelGraph(self, RUNTIMES.UR, 1, 100),
             ]
 
         return benches
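The hand-maintained lists of SubmitKernel, SinKernelGraph, ULLS, and SubmitGraph variants are replaced by nested loops over the configuration space. As an illustration only (not code from this commit), the same enumeration can be expressed as a Cartesian product; with all three runtimes enabled the SubmitKernel loop yields 3 x 2 x 2 = 12 configurations, and 2 x 2 x 2 = 8 when the UR adapter is filtered out:

```python
from itertools import product

# Illustrative stand-in for the RUNTIMES members returned by enabled_runtimes().
runtimes = ["sycl", "l0", "ur"]

# (runtime, in_order_queue, measure_completion) tuples, as in the loop above.
submit_kernel_configs = list(product(runtimes, [0, 1], [0, 1]))
print(len(submit_kernel_configs))  # 12

# Without the UR adapter available, enabled_runtimes() drops "ur":
print(len(list(product([r for r in runtimes if r != "ur"], [0, 1], [0, 1]))))  # 8
```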
@@ -228,98 +256,49 @@ def teardown(self):
         return
 
 
-class SubmitKernelSYCL(ComputeBenchmark):
-    def __init__(self, bench, ioq):
+class SubmitKernel(ComputeBenchmark):
+    def __init__(self, bench, runtime: RUNTIMES, ioq, measure_completion=0):
         self.ioq = ioq
-        super().__init__(bench, "api_overhead_benchmark_sycl", "SubmitKernel")
+        self.runtime = runtime
+        self.measure_completion = measure_completion
+        super().__init__(
+            bench, f"api_overhead_benchmark_{runtime.value}", "SubmitKernel"
+        )
 
     def name(self):
         order = "in order" if self.ioq else "out of order"
-        return f"api_overhead_benchmark_sycl SubmitKernel {order}"
+        completion_str = " with measure completion" if self.measure_completion else ""
+        return f"api_overhead_benchmark_{self.runtime.value} SubmitKernel {order}{completion_str}"
 
     def explicit_group(self):
-        return "SubmitKernel"
-
-    def bin_args(self) -> list[str]:
-        return [
-            f"--Ioq={self.ioq}",
-            "--DiscardEvents=0",
-            "--MeasureCompletion=0",
-            "--iterations=100000",
-            "--Profiling=0",
-            "--NumKernels=10",
-            "--KernelExecTime=1",
-        ]
-
-    def description(self) -> str:
-        order = "in-order" if self.ioq else "out-of-order"
         return (
-            f"Measures CPU time overhead of submitting {order} kernels through SYCL API."
-            "Uses 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time."
-        )
-
-
-class SubmitKernelUR(ComputeBenchmark):
-    def __init__(self, bench, ioq, measureCompletion):
-        self.ioq = ioq
-        self.measureCompletion = measureCompletion
-        super().__init__(bench, "api_overhead_benchmark_ur", "SubmitKernel")
-
-    def name(self):
-        order = "in order" if self.ioq else "out of order"
-        return f"api_overhead_benchmark_ur SubmitKernel {order}" + (
-            " with measure completion" if self.measureCompletion else ""
+            "SubmitKernel"
+            if self.measure_completion == 0
+            else "SubmitKernel With Completion"
         )
 
-    def explicit_group(self):
-        return "SubmitKernel"
-
     def description(self) -> str:
         order = "in-order" if self.ioq else "out-of-order"
-        completion = "including" if self.measureCompletion else "excluding"
-        return (
-            f"Measures CPU time overhead of submitting {order} kernels through Unified Runtime API, "
-            f"{completion} kernel completion time. Uses 10 simple kernels with minimal execution time "
-            f"to isolate API overhead."
-        )
+        runtime_name = runtime_to_name(self.runtime)
 
-    def bin_args(self) -> list[str]:
-        return [
-            f"--Ioq={self.ioq}",
-            "--DiscardEvents=0",
-            f"--MeasureCompletion={self.measureCompletion}",
-            "--iterations=100000",
-            "--Profiling=0",
-            "--NumKernels=10",
-            "--KernelExecTime=1",
-        ]
-
-
-class SubmitKernelL0(ComputeBenchmark):
-    def __init__(self, bench, ioq):
-        self.ioq = ioq
-        super().__init__(bench, "api_overhead_benchmark_l0", "SubmitKernel")
-
-    def name(self):
-        order = "in order" if self.ioq else "out of order"
-        return f"api_overhead_benchmark_l0 SubmitKernel {order}"
+        completion_desc = ""
+        if self.runtime == RUNTIMES.UR:
+            completion_desc = f", {'including' if self.measure_completion else 'excluding'} kernel completion time"
 
-    def explicit_group(self):
-        return "SubmitKernel"
+        l0_specific = ""
+        if self.runtime == RUNTIMES.LEVEL_ZERO:
+            l0_specific = " Uses immediate command lists"
 
-    def description(self) -> str:
-        order = "in-order" if self.ioq else "out-of-order"
         return (
-            f"Measures CPU time overhead of submitting {order} kernels through Level Zero API. "
-            f"Uses immediate command lists with 10 minimal kernels to isolate submission overhead "
-            f"from execution time."
+            f"Measures CPU time overhead of submitting {order} kernels through {runtime_name} API{completion_desc}. "
+            f"Runs 10 simple kernels with minimal execution time to isolate API overhead from kernel execution time. {l0_specific}"
         )
 
     def bin_args(self) -> list[str]:
         return [
             f"--Ioq={self.ioq}",
             "--DiscardEvents=0",
-            "--MeasureCompletion=0",
+            f"--MeasureCompletion={self.measure_completion}",
             "--iterations=100000",
             "--Profiling=0",
             "--NumKernels=10",
@@ -521,12 +500,6 @@ def bin_args(self) -> list[str]:
         ]
 
 
-class RUNTIMES(Enum):
-    SYCL = "sycl"
-    LEVEL_ZERO = "l0"
-    UR = "ur"
-
-
 class GraphApiSinKernelGraph(ComputeBenchmark):
     def __init__(self, bench, runtime: RUNTIMES, withGraphs, numKernels):
         self.withGraphs = withGraphs

devops/scripts/benchmarks/benches/syclbench.py

Lines changed: 2 additions & 2 deletions

@@ -31,8 +31,8 @@ def setup(self):
         repo_path = git_clone(
             self.directory,
             "sycl-bench-repo",
-            "https://github.com/mateuszpn/sycl-bench.git",
-            "1e6ab2cfd004a72c5336c26945965017e06eab71",
+            "https://github.com/unisa-hpc/sycl-bench.git",
+            "31fc70be6266193c4ba60eb1fe3ce26edee4ca5b",
         )
 
         configure_command = [
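The suite now clones the upstream unisa-hpc/sycl-bench repository instead of a personal fork, still pinned to a fixed commit so runs stay reproducible. For illustration only, a minimal sketch of cloning and pinning a repository to a specific revision with plain git commands; the repo's actual `git_clone` helper lives in its own utilities and may differ:

```python
import subprocess
from pathlib import Path

def clone_at_commit(workdir: str, name: str, url: str, commit: str) -> Path:
    """Clone `url` into workdir/name (if not already present) and check out `commit`."""
    repo = Path(workdir) / name
    if not repo.exists():
        subprocess.run(["git", "clone", url, str(repo)], check=True)
    subprocess.run(["git", "-C", str(repo), "checkout", commit], check=True)
    return repo

# Example mirroring the arguments in the diff above:
# clone_at_commit(".", "sycl-bench-repo",
#                 "https://github.com/unisa-hpc/sycl-bench.git",
#                 "31fc70be6266193c4ba60eb1fe3ce26edee4ca5b")
```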

devops/scripts/benchmarks/main.py

Lines changed: 1 addition & 1 deletion

@@ -333,7 +333,7 @@ def validate_and_parse_env_args(env_args):
     parser.add_argument(
         "--adapter",
         type=str,
-        help="Options to build the Unified Runtime as part of the benchmark",
+        help="Unified Runtime adapter to use.",
         default="level_zero",
     )
     parser.add_argument(