 from presets import enabled_suites, presets


+def generate_github_summary(execution_stats, failures):
+    """Generate GitHub workflow summary with execution statistics"""
+    gh_summary: list[str] = []
+    gh_summary.append("### Benchmarks Execution")
+
+    # Overall statistics
+    total_tests = execution_stats["total_tests"]
+    passed_tests = execution_stats["tests_passed"]
+    failed_tests = execution_stats["tests_failed"]
+    warnings = execution_stats["warnings"]
+    errors = len(failures)
+
+    gh_summary.append("#### Overall Statistics")
+    gh_summary.append(f"- **Total number of benchmarks:** {total_tests}")
+    gh_summary.append(f"- **Tests Passed:** {passed_tests}")
+    gh_summary.append(f"- **Tests Failed:** {failed_tests}")
+    gh_summary.append(f"- **Errors:** {errors}")
+    gh_summary.append(f"- **Warnings:** {warnings}")
+    gh_summary.append("")
+
+    # Overall status of execution
+    if failed_tests == 0 and errors == 0:
+        gh_summary.append("#### ✅ Status: SUCCESS")
+        gh_summary.append("Benchmarks seem to have executed successfully!")
+    elif failed_tests > 0 or errors > 0:
+        gh_summary.append("#### ❌ Status: FAILURES DETECTED")
+        gh_summary.append("Some benchmarks failed or encountered errors.")
+
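+    # Warnings are reported in addition to the overall success/failure status above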
+    if warnings > 0:
+        gh_summary.append("#### ⚠️ Status: WARNINGS DETECTED")
+        gh_summary.append("Some benchmarks executed with warnings.")
+
+    gh_summary.append("")
+
+    # Detailed failure info
+    if failures:
+        gh_summary.append("#### Failure Details")
+        gh_summary.append(
+            f"<details><summary>{len(failures)} failed benchmarks:</summary>"
+        )
+        gh_summary.append("")
+
+        for benchmark_name, failure_reason in failures.items():
+            gh_summary.append(f"##### {benchmark_name}")
+            gh_summary.append(f"- **Reason:** {failure_reason}")
+            gh_summary.append("")
+
+        gh_summary.append("</details>")
+        gh_summary.append("")
+
+    # Write the summary to file
+    try:
+        with open(options.github_summary_execution_filename, "w") as f:
+            f.write("\n".join(gh_summary))
+        log.info(
+            f"GitHub summary with execution stats written to {options.github_summary_execution_filename}"
+        )
+    except Exception as e:
+        log.error(f"Failed to write GitHub summary with execution stats: {e}")
+
+
 def run_iterations(
     benchmark: Benchmark,
     env_vars,
@@ -40,7 +101,12 @@ def run_iterations(
     failures: dict[str, str],
     run_trace: TracingType = TracingType.NONE,
     force_trace: bool = False,
-):
+) -> bool:
+    """
+    Returns True if all iterations completed successfully, False otherwise.
+    If options.exit_on_failure is set, an exception is raised instead.
+    """
+
     for iter in range(iters):
         log.info(f"running {benchmark.name()}, iteration {iter}... ")
         try:
@@ -49,10 +115,10 @@ def run_iterations(
             )
             if bench_results is None:
                 if options.exit_on_failure:
-                    raise RuntimeError(f"Benchmark produced no results!")
+                    raise RuntimeError("Benchmark produced no results!")
                 else:
                     failures[benchmark.name()] = "benchmark produced no results!"
-                    break
+                    return False

             for bench_result in bench_results:
                 log.info(
@@ -73,10 +139,15 @@ def run_iterations(
                     f"Benchmark failed: {failure_label} verification failed: {str(e)}"
                 )
             else:
-                failures[failure_label] = f"verification failed: {str(e)}"
-                log.error(f"complete ({failure_label}: verification failed: {str(e)}).")
+                failures[failure_label] = (
+                    f"{failure_label}: verification failed: {str(e)}"
+                )
+                log.error(f"{failure_label}: verification failed: {str(e)}.")
             continue

+    # Iterations completed successfully
+    return True
+

 # https://www.statology.org/modified-z-score/
 def modified_z_score(values: list[float]) -> list[float]:
@@ -110,7 +181,7 @@ def remove_outliers(


 def process_results(
-    results: dict[str, list[Result]], stddev_threshold_override
+    results: dict[str, list[Result]], stddev_threshold_override, execution_stats
 ) -> tuple[bool, list[Result]]:
     processed: list[Result] = []
     # technically, we can detect whether result is below or above threshold per
@@ -142,6 +213,7 @@ def process_results(
             log.warning(
                 f"stddev {stddev} above the threshold {threshold_scaled} ({threshold} times {mean_value}) for {label}"
             )
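+            # Count results whose stddev exceeds the threshold as warnings in the GitHub summary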
+            execution_stats["warnings"] += 1
             valid_results = False

         rlist.sort(key=lambda res: res.value)
@@ -170,7 +242,7 @@ def collect_metadata(suites):
     return metadata


-def main(directory, additional_env_vars, compare_names, filter):
+def main(directory, additional_env_vars, compare_names, filter, execution_stats):
     prepare_workdir(directory, INTERNAL_WORKDIR_VERSION)

     if options.dry_run:
@@ -218,7 +290,7 @@ def main(directory, additional_env_vars, compare_names, filter):

     # TODO: rename "s", rename setup in suite to suite_setup, rename setup in benchmark to benchmark_setup
     # TODO: do not add benchmarks whose suite setup failed
-    # TODO: add a mode where we fail etire script in case of setup (or other) failures and use in CI
+    # TODO: add a mode where we fail entire script in case of setup (or other) failures and use in CI

     for s in suites:
         if s.name() not in enabled_suites(options.preset):
@@ -246,9 +318,9 @@ def main(directory, additional_env_vars, compare_names, filter):
         except Exception as e:
             if options.exit_on_failure:
                 raise e
-            failures[s.name()] = f"Suite setup failure: {e}"
+            failures[s.name()] = f"Suite '{s.name()}' setup failure: {e}"
             log.error(
-                f"{type(s).__name__} setup failed. Benchmarks won't be added."
+                f"Suite {type(s).__name__} setup failed. Benchmarks won't be added."
             )
             log.error(f"failed: {e}")
         else:
@@ -265,19 +337,23 @@ def main(directory, additional_env_vars, compare_names, filter):
             if options.exit_on_failure:
                 raise e
             else:
-                failures[benchmark.name()] = f"Benchmark setup failure: {e}"
+                failures[benchmark.name()] = (
+                    f"Benchmark '{benchmark.name()}' setup failure: {e}"
+                )
                 log.error(f"failed: {e}")

     results = []
     if benchmarks:
         log.info(f"Running {len(benchmarks)} benchmarks...")
+        execution_stats["total_tests"] = len(benchmarks)
     elif not options.dry_run:
         raise RuntimeError("No benchmarks to run.")
     for benchmark in benchmarks:
         try:
             merged_env_vars = {**additional_env_vars}
             intermediate_results: dict[str, list[Result]] = {}
             processed: list[Result] = []
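+            # Tracks whether the most recent run_iterations() call succeeded; used to update execution_stats below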
+            iterations_rc = False

             # Determine if we should run regular benchmarks
             # Run regular benchmarks if:
@@ -292,7 +368,7 @@ def main(directory, additional_env_vars, compare_names, filter):

             if should_run_regular:
                 for _ in range(options.iterations_stddev):
-                    run_iterations(
+                    iterations_rc = run_iterations(
                         benchmark,
                         merged_env_vars,
                         options.iterations,
@@ -301,7 +377,9 @@ def main(directory, additional_env_vars, compare_names, filter):
                         run_trace=TracingType.NONE,
                     )
                 valid, processed = process_results(
-                    intermediate_results, benchmark.stddev_threshold()
+                    intermediate_results,
+                    benchmark.stddev_threshold(),
+                    execution_stats,
                 )
                 if valid:
                     break
@@ -310,7 +388,7 @@ def main(directory, additional_env_vars, compare_names, filter):
             if options.unitrace and (
                 benchmark.traceable(TracingType.UNITRACE) or args.unitrace == "force"
             ):
-                run_iterations(
+                iterations_rc = run_iterations(
                     benchmark,
                     merged_env_vars,
                     1,
@@ -324,7 +402,7 @@ def main(directory, additional_env_vars, compare_names, filter):
                 benchmark.traceable(TracingType.FLAMEGRAPH)
                 or args.flamegraph == "force"
             ):
-                run_iterations(
+                iterations_rc = run_iterations(
                     benchmark,
                     merged_env_vars,
                     1,
@@ -335,11 +413,18 @@ def main(directory, additional_env_vars, compare_names, filter):
                 )

             results += processed
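+            # Record pass/fail for the GitHub summary based on the last run_iterations() result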
+            if iterations_rc:
+                execution_stats["tests_passed"] += 1
+            else:
+                execution_stats["tests_failed"] += 1
         except Exception as e:
+            execution_stats["tests_failed"] += 1
             if options.exit_on_failure:
                 raise e
             else:
-                failures[benchmark.name()] = f"Benchmark run failure: {e}"
+                failures[benchmark.name()] = (
+                    f"Benchmark '{benchmark.name()}' run failure: {e}"
+                )
                 log.error(f"failed: {e}")

     this_name = options.current_run_name
@@ -408,6 +493,10 @@ def main(directory, additional_env_vars, compare_names, filter):
         generate_html(history, compare_names, html_path, metadata)
         log.info(f"HTML with benchmark results has been generated")

+    # Generate GitHub summary
+    if options.produce_github_summary:
+        generate_github_summary(execution_stats, failures)
+
     if options.exit_on_failure and failures:
         # just in case code missed to raise earlier
         raise RuntimeError(str(failures))
@@ -691,6 +780,12 @@ def validate_and_parse_env_args(env_args):
         help="Set the logging level",
         default="info",
     )
+    parser.add_argument(
+        "--produce-github-summary",
+        action="store_true",
+        help=f"Produce an execution stats summary for the GitHub workflow, in file '{options.github_summary_execution_filename}'.",
+        default=False,
+    )

     args = parser.parse_args()
     additional_env_vars = validate_and_parse_env_args(args.env)
@@ -724,6 +819,7 @@ def validate_and_parse_env_args(env_args):
     options.flamegraph = args.flamegraph is not None
     options.archive_baseline_days = args.archive_baseline_after
     options.archive_pr_days = args.archive_pr_after
+    options.produce_github_summary = args.produce_github_summary

     # Initialize logger with command line arguments
     initialize_logger(args.verbose, args.log_level)
@@ -738,6 +834,14 @@ def validate_and_parse_env_args(env_args):
             parser.error("Specified --output-dir is not a valid path")
         options.output_directory = os.path.abspath(args.output_dir)

+    # Initialize GitHub summary tracking
+    execution_stats = {
+        "total_tests": 0,
+        "tests_passed": 0,
+        "tests_failed": 0,
+        "warnings": 0,
+    }
+
     # Options intended for CI:
     options.timestamp_override = args.timestamp_override
     if args.results_dir is not None:
@@ -780,6 +884,7 @@ def validate_and_parse_env_args(env_args):
         options.device_architecture = ""
         log.warning(f"Failed to fetch device architecture: {e}")
         log.warning("Defaulting to generic benchmark parameters.")
+        execution_stats["warnings"] += 1

     log.info(f"Selected device architecture: {options.device_architecture}")

@@ -788,4 +893,5 @@ def validate_and_parse_env_args(env_args):
         additional_env_vars,
         args.compare,
         benchmark_filter,
+        execution_stats,
     )