Commit b8204f8

[Bench] Add stats for benchmark executions in GHA summary (#20638)
1 parent 9d5b4b8 commit b8204f8

File tree: 4 files changed, +140 −23 lines


devops/actions/run-tests/benchmark/action.yml

Lines changed: 8 additions & 2 deletions
@@ -231,6 +231,10 @@ runs:
 WORKDIR="$(realpath ./llvm_test_workdir)"
 if [ -n "$WORKDIR" ] && [ -d "$WORKDIR" ] && [[ "$WORKDIR" == *llvm_test_workdir* ]]; then rm -rf "$WORKDIR" ; fi
 
+# Clean up potentially existing, old summary files
+[ -f "github_summary_exe.md" ] && rm github_summary_exe.md
+[ -f "github_summary_reg.md" ] && rm github_summary_reg.md
+
 numactl --cpunodebind "$NUMA_NODE" --membind "$NUMA_NODE" \
 ./devops/scripts/benchmarks/main.py "$WORKDIR" \
 --sycl "$(realpath ./toolchain)" \
@@ -243,6 +247,7 @@ runs:
 --preset "$PRESET" \
 --timestamp-override "$SAVE_TIMESTAMP" \
 --detect-version sycl,compute_runtime \
+--produce-github-summary \
 ${{ inputs.exit_on_failure == 'true' && '--exit-on-failure --iterations 1' || '' }}
 # TODO: add back: "--flamegraph inclusive" once works properly
 
@@ -273,8 +278,9 @@ runs:
 if: always()
 shell: bash
 run: |
-# Cache changes and upload github summary
-[ -f "github_summary.md" ] && cat github_summary.md >> $GITHUB_STEP_SUMMARY
+# Cache changes and upload github summaries
+[ -f "github_summary_exe.md" ] && cat github_summary_exe.md >> $GITHUB_STEP_SUMMARY
+[ -f "github_summary_reg.md" ] && cat github_summary_reg.md >> $GITHUB_STEP_SUMMARY
 
 cd "./llvm-ci-perf-results"
 git add .
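
For context on the last hunk: the upload step appends whichever of the two summary files exist to the job's step summary. Below is a minimal Python sketch of that aggregation logic, not part of this commit; the filenames match the defaults added in options.py further down, and the local fallback path is an assumption for illustration.

```python
import os

# Filenames match the defaults added in options.py by this commit.
SUMMARY_FILES = ["github_summary_exe.md", "github_summary_reg.md"]


def append_summaries(step_summary_path: str) -> None:
    """Append any existing summary files to the GitHub step summary, mirroring
    the `cat ... >> $GITHUB_STEP_SUMMARY` lines in the workflow step above."""
    with open(step_summary_path, "a") as out:
        for name in SUMMARY_FILES:
            if os.path.isfile(name):
                with open(name) as f:
                    out.write(f.read() + "\n")


if __name__ == "__main__":
    # GITHUB_STEP_SUMMARY is set by GitHub Actions; the fallback is only for local runs.
    append_summaries(os.environ.get("GITHUB_STEP_SUMMARY", "step_summary.md"))
```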

devops/scripts/benchmarks/compare.py

Lines changed: 6 additions & 3 deletions
@@ -357,7 +357,8 @@ def to_hist(
 parser_avg.add_argument(
 "--produce-github-summary",
 action="store_true",
-help=f"Create a summary file '{options.github_summary_filename}' for Github workflow summaries.",
+help=f"Produce regression summary for Github workflow, in file '{options.github_summary_regression_filename}'.",
+default=False,
 )
 
 args = parser.parse_args()
@@ -473,14 +474,16 @@ def print_regression(entry: dict, is_warning: bool = False):
 
 if not args.dry_run:
 if args.produce_github_summary:
-with open(options.github_summary_filename, "w") as f:
+with open(options.github_summary_regression_filename, "w") as f:
 f.write("\n".join(gh_summary))
 exit(1) # Exit 1 to trigger Github test failure
 
 log.info("No unexpected regressions found!")
 if args.produce_github_summary:
+gh_summary.append("")
+gh_summary.append("### Regressions")
 gh_summary.append("No unexpected regressions found!")
-with open(options.github_summary_filename, "w") as f:
+with open(options.github_summary_regression_filename, "w") as f:
 f.write("\n".join(gh_summary))
 
 else:
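
The compare.py change reroutes the regression summary from the old shared github_summary.md to the dedicated github_summary_reg.md, so it no longer collides with the execution summary written by main.py. A rough sketch of the resulting write path follows, assuming a trimmed-down Options stand-in; the real gh_summary lines come from compare.py's regression checks.

```python
from dataclasses import dataclass


@dataclass
class Options:
    # Assumed stand-in; the real options object lives in options.py.
    github_summary_regression_filename: str = "github_summary_reg.md"


options = Options()


def write_regression_summary(gh_summary: list, regressions_found: bool) -> int:
    """Write the accumulated Markdown lines and return the intended exit code."""
    if not regressions_found:
        # Mirrors the new "### Regressions" section appended when nothing regressed.
        gh_summary += ["", "### Regressions", "No unexpected regressions found!"]
    with open(options.github_summary_regression_filename, "w") as f:
        f.write("\n".join(gh_summary))
    # compare.py exits 1 when regressions are found, to fail the GitHub job.
    return 1 if regressions_found else 0


# Example: a run without regressions writes the summary and would exit 0.
print(write_regression_summary(["## Benchmark comparison"], regressions_found=False))
```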

devops/scripts/benchmarks/main.py

Lines changed: 122 additions & 16 deletions
@@ -32,6 +32,67 @@
 from presets import enabled_suites, presets
 
 
+def generate_github_summary(execution_stats, failures):
+"""Generate GitHub workflow summary with execution statistics"""
+gh_summary: list[str] = []
+gh_summary.append("### Benchmarks Execution")
+
+# Overall statistics
+total_tests = execution_stats["total_tests"]
+passed_tests = execution_stats["tests_passed"]
+failed_tests = execution_stats["tests_failed"]
+warnings = execution_stats["warnings"]
+errors = len(failures)
+
+gh_summary.append("#### Overall Statistics")
+gh_summary.append(f"- **Total Number of benchmarks:** {total_tests}")
+gh_summary.append(f"- **Tests Passed:** {passed_tests}")
+gh_summary.append(f"- **Tests Failed:** {failed_tests}")
+gh_summary.append(f"- **Errors:** {errors}")
+gh_summary.append(f"- **Warnings:** {warnings}")
+gh_summary.append("")
+
+# Overall status of execution
+if failed_tests == 0 and errors == 0:
+gh_summary.append("#### ✅ Status: SUCCESS")
+gh_summary.append("Benchmarks seem to have executed successfully!")
+elif failed_tests > 0 or errors > 0:
+gh_summary.append("#### ❌ Status: FAILURES DETECTED")
+gh_summary.append("Some benchmarks failed or encountered errors.")
+
+if warnings > 0:
+gh_summary.append("#### ⚠️ Status: WARNINGS DETECTED")
+gh_summary.append("Some benchmarks executed with warnings.")
+
+gh_summary.append("")
+
+# Detailed failures info
+if failures:
+gh_summary.append("#### Failure Details")
+gh_summary.append(
+f"<details><summary>{len(failures)} failed benchmarks:</summary>"
+)
+gh_summary.append("")
+
+for benchmark_name, failure_reason in failures.items():
+gh_summary.append(f"##### {benchmark_name}")
+gh_summary.append(f"- **Reason:** {failure_reason}")
+gh_summary.append("")
+
+gh_summary.append("</details>")
+gh_summary.append("")
+
+# Write the summary to file
+try:
+with open(options.github_summary_execution_filename, "w") as f:
+f.write("\n".join(gh_summary))
+log.info(
+f"GitHub summary with execution stats written to {options.github_summary_execution_filename}"
+)
+except Exception as e:
+log.error(f"Failed to write GitHub summary with execution stats: {e}")
+
+
 def run_iterations(
 benchmark: Benchmark,
 env_vars,
@@ -40,7 +101,12 @@ def run_iterations(
 failures: dict[str, str],
 run_trace: TracingType = TracingType.NONE,
 force_trace: bool = False,
-):
+) -> bool:
+"""
+Returns True if all iterations completed successfully, False otherwise.
+Unless options.exit_on_failure is set, then exception is raised.
+"""
+
 for iter in range(iters):
 log.info(f"running {benchmark.name()}, iteration {iter}... ")
 try:
@@ -49,10 +115,10 @@ def run_iterations(
 )
 if bench_results is None:
 if options.exit_on_failure:
-raise RuntimeError(f"Benchmark produced no results!")
+raise RuntimeError("Benchmark produced no results!")
 else:
 failures[benchmark.name()] = "benchmark produced no results!"
-break
+return False
 
 for bench_result in bench_results:
 log.info(
@@ -73,10 +139,15 @@ def run_iterations(
 f"Benchmark failed: {failure_label} verification failed: {str(e)}"
 )
 else:
-failures[failure_label] = f"verification failed: {str(e)}"
-log.error(f"complete ({failure_label}: verification failed: {str(e)}).")
+failures[failure_label] = (
+f"{failure_label}: verification failed: {str(e)}"
+)
+log.error(f"{failure_label}: verification failed: {str(e)}.")
 continue
 
+# Iterations completed successfully
+return True
+
 
 # https://www.statology.org/modified-z-score/
 def modified_z_score(values: list[float]) -> list[float]:
@@ -110,7 +181,7 @@ def remove_outliers(
 
 
 def process_results(
-results: dict[str, list[Result]], stddev_threshold_override
+results: dict[str, list[Result]], stddev_threshold_override, execution_stats
 ) -> tuple[bool, list[Result]]:
 processed: list[Result] = []
 # technically, we can detect whether result is below or above threshold per
@@ -142,6 +213,7 @@ def process_results(
 log.warning(
 f"stddev {stddev} above the threshold {threshold_scaled} ({threshold} times {mean_value}) for {label}"
 )
+execution_stats["warnings"] += 1
 valid_results = False
 
 rlist.sort(key=lambda res: res.value)
@@ -170,7 +242,7 @@ def collect_metadata(suites):
 return metadata
 
 
-def main(directory, additional_env_vars, compare_names, filter):
+def main(directory, additional_env_vars, compare_names, filter, execution_stats):
 prepare_workdir(directory, INTERNAL_WORKDIR_VERSION)
 
 if options.dry_run:
@@ -218,7 +290,7 @@ def main(directory, additional_env_vars, compare_names, filter):
 
 # TODO: rename "s", rename setup in suite to suite_setup, rename setup in benchmark to benchmark_setup
 # TODO: do not add benchmarks whose suite setup failed
-# TODO: add a mode where we fail etire script in case of setup (or other) failures and use in CI
+# TODO: add a mode where we fail entire script in case of setup (or other) failures and use in CI
 
 for s in suites:
 if s.name() not in enabled_suites(options.preset):
@@ -246,9 +318,9 @@ def main(directory, additional_env_vars, compare_names, filter):
 except Exception as e:
 if options.exit_on_failure:
 raise e
-failures[s.name()] = f"Suite setup failure: {e}"
+failures[s.name()] = f"Suite '{s.name()}' setup failure: {e}"
 log.error(
-f"{type(s).__name__} setup failed. Benchmarks won't be added."
+f"Suite {type(s).__name__} setup failed. Benchmarks won't be added."
 )
 log.error(f"failed: {e}")
 else:
@@ -265,19 +337,23 @@ def main(directory, additional_env_vars, compare_names, filter):
 if options.exit_on_failure:
 raise e
 else:
-failures[benchmark.name()] = f"Benchmark setup failure: {e}"
+failures[benchmark.name()] = (
+f"Benchmark '{benchmark.name()}' setup failure: {e}"
+)
 log.error(f"failed: {e}")
 
 results = []
 if benchmarks:
 log.info(f"Running {len(benchmarks)} benchmarks...")
+execution_stats["total_tests"] = len(benchmarks)
 elif not options.dry_run:
 raise RuntimeError("No benchmarks to run.")
 for benchmark in benchmarks:
 try:
 merged_env_vars = {**additional_env_vars}
 intermediate_results: dict[str, list[Result]] = {}
 processed: list[Result] = []
+iterations_rc = False
 
 # Determine if we should run regular benchmarks
 # Run regular benchmarks if:
@@ -292,7 +368,7 @@ def main(directory, additional_env_vars, compare_names, filter):
 
 if should_run_regular:
 for _ in range(options.iterations_stddev):
-run_iterations(
+iterations_rc = run_iterations(
 benchmark,
 merged_env_vars,
 options.iterations,
@@ -301,7 +377,9 @@ def main(directory, additional_env_vars, compare_names, filter):
 run_trace=TracingType.NONE,
 )
 valid, processed = process_results(
-intermediate_results, benchmark.stddev_threshold()
+intermediate_results,
+benchmark.stddev_threshold(),
+execution_stats,
 )
 if valid:
 break
@@ -310,7 +388,7 @@ def main(directory, additional_env_vars, compare_names, filter):
 if options.unitrace and (
 benchmark.traceable(TracingType.UNITRACE) or args.unitrace == "force"
 ):
-run_iterations(
+iterations_rc = run_iterations(
 benchmark,
 merged_env_vars,
 1,
@@ -324,7 +402,7 @@ def main(directory, additional_env_vars, compare_names, filter):
 benchmark.traceable(TracingType.FLAMEGRAPH)
 or args.flamegraph == "force"
 ):
-run_iterations(
+iterations_rc = run_iterations(
 benchmark,
 merged_env_vars,
 1,
@@ -335,11 +413,18 @@ def main(directory, additional_env_vars, compare_names, filter):
 )
 
 results += processed
+if iterations_rc:
+execution_stats["tests_passed"] += 1
+else:
+execution_stats["tests_failed"] += 1
 except Exception as e:
+execution_stats["tests_failed"] += 1
 if options.exit_on_failure:
 raise e
 else:
-failures[benchmark.name()] = f"Benchmark run failure: {e}"
+failures[benchmark.name()] = (
+f"Benchmark '{benchmark.name()}' run failure: {e}"
+)
 log.error(f"failed: {e}")
 
 this_name = options.current_run_name
@@ -408,6 +493,10 @@ def main(directory, additional_env_vars, compare_names, filter):
 generate_html(history, compare_names, html_path, metadata)
 log.info(f"HTML with benchmark results has been generated")
 
+# Generate GitHub summary
+if options.produce_github_summary:
+generate_github_summary(execution_stats, failures)
+
 if options.exit_on_failure and failures:
 # just in case code missed to raise earlier
 raise RuntimeError(str(failures))
@@ -691,6 +780,12 @@ def validate_and_parse_env_args(env_args):
 help="Set the logging level",
 default="info",
 )
+parser.add_argument(
+"--produce-github-summary",
+action="store_true",
+help=f"Produce execution stats summary for Github workflow, in file '{options.github_summary_execution_filename}'.",
+default=False,
+)
 
 args = parser.parse_args()
 additional_env_vars = validate_and_parse_env_args(args.env)
@@ -724,6 +819,7 @@ def validate_and_parse_env_args(env_args):
 options.flamegraph = args.flamegraph is not None
 options.archive_baseline_days = args.archive_baseline_after
 options.archive_pr_days = args.archive_pr_after
+options.produce_github_summary = args.produce_github_summary
 
 # Initialize logger with command line arguments
 initialize_logger(args.verbose, args.log_level)
@@ -738,6 +834,14 @@ def validate_and_parse_env_args(env_args):
 parser.error("Specified --output-dir is not a valid path")
 options.output_directory = os.path.abspath(args.output_dir)
 
+# Initialize GitHub summary tracking
+execution_stats = {
+"total_tests": 0,
+"tests_passed": 0,
+"tests_failed": 0,
+"warnings": 0,
+}
+
 # Options intended for CI:
 options.timestamp_override = args.timestamp_override
 if args.results_dir is not None:
@@ -780,6 +884,7 @@ def validate_and_parse_env_args(env_args):
 options.device_architecture = ""
 log.warning(f"Failed to fetch device architecture: {e}")
 log.warning("Defaulting to generic benchmark parameters.")
+execution_stats["warnings"] += 1
 
 log.info(f"Selected device architecture: {options.device_architecture}")
 
@@ -788,4 +893,5 @@ def validate_and_parse_env_args(env_args):
 additional_env_vars,
 args.compare,
 benchmark_filter,
+execution_stats,
 )
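
To illustrate what the new generate_github_summary output looks like, here is a condensed, self-contained sketch of the rendering logic with made-up sample data. It omits the warnings-only status, the <details> wrapper, and the write to options.github_summary_execution_filename that the committed function performs; the benchmark name and counts below are hypothetical.

```python
def render_execution_summary(stats: dict, failures: dict) -> str:
    # Header and overall statistics, mirroring the Markdown emitted above.
    lines = ["### Benchmarks Execution", "", "#### Overall Statistics"]
    lines += [
        f"- **Total Number of benchmarks:** {stats['total_tests']}",
        f"- **Tests Passed:** {stats['tests_passed']}",
        f"- **Tests Failed:** {stats['tests_failed']}",
        f"- **Errors:** {len(failures)}",
        f"- **Warnings:** {stats['warnings']}",
        "",
    ]
    # Overall status: success only if nothing failed and nothing errored.
    if stats["tests_failed"] == 0 and not failures:
        lines.append("#### ✅ Status: SUCCESS")
    else:
        lines.append("#### ❌ Status: FAILURES DETECTED")
    # Per-benchmark failure details.
    for name, reason in failures.items():
        lines += ["", f"##### {name}", f"- **Reason:** {reason}"]
    return "\n".join(lines)


# Hypothetical numbers and benchmark name, for illustration only.
example_stats = {"total_tests": 3, "tests_passed": 2, "tests_failed": 1, "warnings": 0}
example_failures = {"example_benchmark": "benchmark produced no results!"}
print(render_execution_summary(example_stats, example_failures))
```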

devops/scripts/benchmarks/options.py

Lines changed: 4 additions & 2 deletions
@@ -97,8 +97,10 @@ class Options:
 # CI scripts vs SYCl build source.
 github_repo_override: str = None
 git_commit_override: str = None
-# Filename used to store Github summary files:
-github_summary_filename: str = "github_summary.md"
+# Flag and filenames used to store Github summary files:
+produce_github_summary: bool = False
+github_summary_execution_filename: str = "github_summary_exe.md"
+github_summary_regression_filename: str = "github_summary_reg.md"
 # Archiving settings
 # Archived runs are stored separately from the main dataset but are still accessible
 # via the HTML UI when "Include archived runs" is enabled.
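
The new Options fields are the single source of truth for the summary filenames that action.yml hard-codes and that main.py and compare.py write to. A minimal sketch of that coupling, with the dataclass trimmed to just the fields added here:

```python
from dataclasses import dataclass


@dataclass
class Options:
    # Only the fields added in this commit; the real Options class has many more.
    produce_github_summary: bool = False
    github_summary_execution_filename: str = "github_summary_exe.md"
    github_summary_regression_filename: str = "github_summary_reg.md"


options = Options(produce_github_summary=True)

# action.yml hard-codes these names when cleaning up and uploading the summaries,
# so they must stay in sync with the defaults above.
assert options.github_summary_execution_filename == "github_summary_exe.md"
assert options.github_summary_regression_filename == "github_summary_reg.md"
```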
