
Commit e7d824d

Add ConsistencyCheckError exception and handle consistency checks in benchmarks.
1 parent: 4d166fc

3 files changed (+45, -4 lines)

src/modelbench/cli.py

Lines changed: 18 additions & 4 deletions
@@ -24,7 +24,11 @@
 import modelgauge.annotators.cheval.registration # noqa: F401
 from modelbench.benchmark_runner import BenchmarkRun, BenchmarkRunner, JsonRunTracker, TqdmRunTracker
 from modelbench.benchmarks import GeneralPurposeAiChatBenchmarkV1, SecurityBenchmark
-from modelbench.consistency_checker import ConsistencyChecker, summarize_consistency_check_results
+from modelbench.consistency_checker import (
+    ConsistencyCheckError,
+    ConsistencyChecker,
+    summarize_consistency_check_results,
+)
 from modelbench.record import dump_json
 from modelbench.standards import Standards
 from modelgauge.config import load_secrets_from_config, write_default_config
@@ -188,7 +192,11 @@ def general_benchmark(
     sut = make_sut(sut_uid)
     benchmark = GeneralPurposeAiChatBenchmarkV1(locale, prompt_set, evaluator)
     check_benchmark(benchmark)
-    run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, run_path, output_dir, run_uid, user)
+    try:
+        run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, run_path, output_dir, run_uid, user)
+    except ConsistencyCheckError as e:
+        echo(termcolor.colored(str(e), "red"), err=True)
+        sys.exit(e.EXIT_CODE)
 
 
 @benchmark.command("security", help="run a security benchmark")
@@ -211,7 +219,11 @@ def security_benchmark(
     sut = make_sut(sut_uid)
     benchmark = SecurityBenchmark(locale, prompt_set, evaluator=evaluator)
     check_benchmark(benchmark)
-    run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, run_path, output_dir, run_uid, user)
+    try:
+        run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, run_path, output_dir, run_uid, user)
+    except ConsistencyCheckError as e:
+        echo(termcolor.colored(str(e), "red"), err=True)
+        sys.exit(e.EXIT_CODE)
 
 
 def run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, run_path, outputdir, run_uid, user):
@@ -233,7 +245,9 @@ def run_and_report_benchmark(benchmark, sut, max_instances, debug, json_logs, run_path, outputdir, run_uid, user):
     annotation_records.write(json.dumps(annotations))
     print(f"Wrote annotations for {benchmark.uid} to {annotation_path}.")
 
-    run_consistency_check(run.journal_path, verbose=True)
+    consistent = run_consistency_check(run.journal_path, verbose=True)
+    if not consistent:
+        raise ConsistencyCheckError("Consistency check failed for the benchmark run.")
 
 
 @cli.command(
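
Taken together, the cli.py changes make a failed journal consistency check fatal for the run: run_and_report_benchmark now raises, and each benchmark command catches the exception, prints it in red on stderr, and exits with the error's dedicated exit code. Below is a condensed, self-contained sketch of that control flow; the stubbed run_consistency_check and the plain print are stand-ins, not the real modelbench code.

# Condensed sketch of the failure path introduced in this commit.
import sys


class ConsistencyCheckError(Exception):
    EXIT_CODE = 2  # defined in src/modelbench/consistency_checker.py in this commit


def run_consistency_check(journal_path, verbose=False):
    # Stand-in: the real checker inspects the run journal and returns a bool.
    return False


def run_and_report_benchmark(journal_path):
    # ...benchmark execution, record and annotation writing elided...
    consistent = run_consistency_check(journal_path, verbose=True)
    if not consistent:
        raise ConsistencyCheckError("Consistency check failed for the benchmark run.")


def benchmark_command(journal_path):
    # Mirrors the try/except added to the general and security CLI commands.
    try:
        run_and_report_benchmark(journal_path)
    except ConsistencyCheckError as e:
        print(str(e), file=sys.stderr)  # the CLI uses echo(termcolor.colored(..., "red"), err=True)
        sys.exit(e.EXIT_CODE)


if __name__ == "__main__":
    benchmark_command("journal.jsonl")  # exits with status 2 because the stub reports a failure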

src/modelbench/consistency_checker.py

Lines changed: 4 additions & 0 deletions
@@ -620,3 +620,7 @@ def summarize_consistency_check_results(checkers: List[ConsistencyChecker]):
 
     console = Console()
     console.print(table)
+
+
+class ConsistencyCheckError(Exception):
+    EXIT_CODE = 2
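
The dedicated EXIT_CODE lets wrappers and CI scripts distinguish a failed consistency check from other failures. A minimal sketch follows, assuming the package's console entry point is invoked as `modelbench`; the command name and arguments are assumptions for illustration, not confirmed by this diff.

# Hypothetical wrapper: run the benchmark CLI and react to the new exit code.
import subprocess
import sys

CONSISTENCY_CHECK_EXIT_CODE = 2  # ConsistencyCheckError.EXIT_CODE in this commit

result = subprocess.run(["modelbench", "benchmark", "general", "--sut", "fake-sut"])
if result.returncode == CONSISTENCY_CHECK_EXIT_CODE:
    print("Benchmark ran, but the journal failed its consistency check.", file=sys.stderr)
sys.exit(result.returncode)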

tests/modelbench_tests/test_run.py

Lines changed: 23 additions & 0 deletions
@@ -21,6 +21,7 @@
     SecurityScore,
 )
 from modelbench.cli import cli
+from modelbench.consistency_checker import ConsistencyCheckError
 from modelbench.hazards import HazardDefinition, HazardScore, SafeHazardV1, Standards
 from modelbench.scoring import ValueEstimate
 from modelbench.standards import NoStandardsFileError, OverwriteStandardsFileError
@@ -318,6 +319,7 @@ def invoke(command, args=None, **kwargs):
     @pytest.mark.parametrize("sut_uid", ["fake-sut", "google/gemma-3-27b-it:scaleway:hfrelay"])
     def test_benchmark_basic_run_produces_json(
         self,
+        monkeypatch,
         runner,
         mock_run_benchmarks,
         mock_score_benchmarks,
@@ -346,6 +348,7 @@ def test_benchmark_basic_run_produces_json(
             sut_uid,
             *benchmark_options,
         ]
+        monkeypatch.setattr(modelbench.cli, "run_consistency_check", lambda *args, **kwargs: True)
         result = runner(
             cli,
             command_options,
@@ -411,6 +414,7 @@ def test_benchmark_multiple_suts_produces_json(
 
         mock = MagicMock(return_value=[self.mock_score(sut_uid, benchmark), self.mock_score("demo_yes_no", benchmark)])
         monkeypatch.setattr(modelbench.cli, "score_benchmarks", mock)
+        monkeypatch.setattr(modelbench.cli, "run_consistency_check", lambda *args, **kwargs: True)
 
         result = runner(
             cli,
@@ -430,6 +434,25 @@ def test_benchmark_multiple_suts_produces_json(
         assert result.exit_code == 0
         assert (run_dir / "records" / f"benchmark_record-{benchmark.uid}.json").exists
 
+    @pytest.mark.parametrize("benchmark_type", ["general", "security"])
+    def test_general_benchmark_exits_when_consistency_fails(self, runner, benchmark_type, sut, monkeypatch):
+        fail_run = MagicMock(side_effect=ConsistencyCheckError("consistency failed"))
+        monkeypatch.setattr(modelbench.cli, "run_and_report_benchmark", fail_run)
+
+        result = runner(
+            cli,
+            [
+                "benchmark",
+                benchmark_type,
+                "--sut",
+                sut.uid,
+            ],
+            catch_exceptions=False,
+        )
+
+        fail_run.assert_called_once()
+        assert result.exit_code == ConsistencyCheckError.EXIT_CODE
+
     def test_benchmark_bad_sut_errors_out(self, runner):
         benchmark_options = ["--version", "1.1"]
         benchmark_options.extend(["--locale", "en_us"])
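
The test changes cover both sides of the new behavior: the existing happy-path tests monkeypatch run_consistency_check to return True so they still exit 0, and the new parametrized test forces run_and_report_benchmark to raise and asserts the CLI exits with ConsistencyCheckError.EXIT_CODE. Below is a self-contained illustration of that exit-code test pattern using a toy click command; it is not modelbench's actual CLI or fixtures.

# Toy reproduction of the exit-code test pattern, runnable with pytest.
import sys
from unittest.mock import MagicMock

import click
from click.testing import CliRunner


class ConsistencyCheckError(Exception):
    EXIT_CODE = 2


def run_and_report_benchmark():
    pass  # replaced with a failing mock inside the test


@click.command()
def bench():
    try:
        run_and_report_benchmark()
    except ConsistencyCheckError as e:
        click.echo(str(e), err=True)
        sys.exit(e.EXIT_CODE)


def test_exits_when_consistency_fails(monkeypatch):
    fail_run = MagicMock(side_effect=ConsistencyCheckError("consistency failed"))
    monkeypatch.setattr(sys.modules[__name__], "run_and_report_benchmark", fail_run)

    # CliRunner converts sys.exit into result.exit_code, even with catch_exceptions=False.
    result = CliRunner().invoke(bench, [], catch_exceptions=False)

    fail_run.assert_called_once()
    assert result.exit_code == ConsistencyCheckError.EXIT_CODE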
