 import modelgauge.annotators.cheval.registration  # noqa: F401
 from modelbench.benchmark_runner import BenchmarkRun, BenchmarkRunner, JsonRunTracker, TqdmRunTracker
 from modelbench.benchmarks import GeneralPurposeAiChatBenchmarkV1, SecurityBenchmark
-from modelbench.consistency_checker import ConsistencyChecker, summarize_consistency_check_results
+from modelbench.consistency_checker import (
+    ConsistencyCheckError,
+    ConsistencyChecker,
+    summarize_consistency_check_results,
+)
 from modelbench.record import dump_json
 from modelbench.standards import Standards
 from modelgauge.config import load_secrets_from_config, write_default_config
@@ -188,7 +192,11 @@ def general_benchmark(
     sut = make_sut(sut_uid)
     benchmark = GeneralPurposeAiChatBenchmarkV1(locale, prompt_set, evaluator)
     check_benchmark(benchmark)
-    run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, run_path, output_dir, run_uid, user)
+    try:
+        run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, run_path, output_dir, run_uid, user)
+    except ConsistencyCheckError as e:
+        echo(termcolor.colored(str(e), "red"), err=True)
+        sys.exit(e.EXIT_CODE)


 @benchmark.command("security", help="run a security benchmark")
@@ -211,7 +219,11 @@ def security_benchmark(
     sut = make_sut(sut_uid)
     benchmark = SecurityBenchmark(locale, prompt_set, evaluator=evaluator)
     check_benchmark(benchmark)
-    run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, run_path, output_dir, run_uid, user)
+    try:
+        run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, run_path, output_dir, run_uid, user)
+    except ConsistencyCheckError as e:
+        echo(termcolor.colored(str(e), "red"), err=True)
+        sys.exit(e.EXIT_CODE)


 def run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, run_path, outputdir, run_uid, user):
@@ -233,7 +245,9 @@ def run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, ru
         annotation_records.write(json.dumps(annotations))
     print(f"Wrote annotations for {benchmark.uid} to {annotation_path}.")

-    run_consistency_check(run.journal_path, verbose=True)
+    consistent = run_consistency_check(run.journal_path, verbose=True)
+    if not consistent:
+        raise ConsistencyCheckError("Consistency check failed for the benchmark run.")


 @cli.command(