    python -m cookbooks.zero_shot_evaluation --config config.yaml
    python -m cookbooks.zero_shot_evaluation --config config.yaml --save
    python -m cookbooks.zero_shot_evaluation --config config.yaml --queries_file queries.json --save
    python -m cookbooks.zero_shot_evaluation --config config.yaml --rerun-judge --save
"""

import asyncio
@@ -28,6 +29,43 @@ def _load_queries_from_file(queries_file: str) -> List[GeneratedQuery]:
2829 return queries
2930
3031
def _clear_judge_results(output_dir: str) -> bool:
    """Clear only comparison results so the judge step can re-run with a new model.

    Keeps queries, responses, and rubrics intact.
    Only removes comparison_details.json and rewinds the checkpoint to the
    RUBRICS_GENERATED stage with its pair counters zeroed, so that resuming
    re-executes just the pairwise evaluation.

    Args:
        output_dir: Directory holding checkpoint.json and comparison_details.json.

    Returns:
        True if successfully cleared, False if no checkpoint exists or the
        checkpoint file cannot be read/parsed.
    """
    output_path = Path(output_dir)
    checkpoint_file = output_path / "checkpoint.json"
    details_file = output_path / "comparison_details.json"

    if not checkpoint_file.exists():
        logger.warning("No checkpoint found, nothing to clear")
        return False

    # Remove comparison details (the judge's previous verdicts); everything
    # produced upstream of the judge is preserved.
    if details_file.exists():
        details_file.unlink()
        logger.info(f"Removed {details_file}")

    # Update checkpoint to RUBRICS_GENERATED stage. A corrupt or unreadable
    # checkpoint cannot be rewound — report it instead of crashing the CLI.
    try:
        with open(checkpoint_file, "r", encoding="utf-8") as f:
            checkpoint = json.load(f)
    except (json.JSONDecodeError, OSError) as e:
        logger.error(f"Could not read checkpoint {checkpoint_file}: {e}")
        return False

    checkpoint["stage"] = "rubrics_generated"
    checkpoint["evaluated_pairs"] = 0
    checkpoint["total_pairs"] = 0

    with open(checkpoint_file, "w", encoding="utf-8") as f:
        json.dump(checkpoint, f, indent=2, ensure_ascii=False)

    logger.info("Reset checkpoint to RUBRICS_GENERATED stage (will re-run pairwise evaluation)")
    return True
67+
68+
3169async def _run_evaluation (
3270 config_path : str ,
3371 output_dir : Optional [str ] = None ,
@@ -67,6 +105,7 @@ def main(
67105 queries_file : Optional [str ] = None ,
68106 save : bool = False ,
69107 fresh : bool = False ,
108+ rerun_judge : bool = False ,
70109) -> None :
71110 """Zero-shot evaluation CLI with checkpoint support.
72111
@@ -76,6 +115,8 @@ def main(
76115 queries_file: Path to pre-generated queries JSON (skip query generation)
77116 save: Whether to save results to file
78117 fresh: Start fresh, ignore any existing checkpoint
118+ rerun_judge: Re-run only pairwise evaluation with new judge model
119+ (keeps queries, responses, and rubrics)
79120
80121 Examples:
81122 # Normal run (auto-resumes from checkpoint)
@@ -86,6 +127,9 @@ def main(
86127
87128 # Start fresh, ignore checkpoint
88129 python -m cookbooks.zero_shot_evaluation --config config.yaml --fresh --save
130+
131+ # Re-run with new judge model (keeps queries/responses/rubrics)
132+ python -m cookbooks.zero_shot_evaluation --config config.yaml --rerun-judge --save
89133 """
90134 config_path = Path (config )
91135 if not config_path .exists ():
@@ -98,15 +142,27 @@ def main(
98142 logger .error (f"Queries file not found: { queries_file } " )
99143 return
100144
101- logger .info (f"Starting zero-shot evaluation with config: { config } " )
102- if queries_file :
103- logger .info (f"Using pre-generated queries from: { queries_file } " )
104- if fresh :
145+ # Load config to get output_dir
146+ loaded_config = load_config (str (config_path ))
147+ effective_output_dir = output_dir or loaded_config .output .output_dir
148+
149+ # Handle rerun_judge and fresh flags
150+ if rerun_judge :
151+ if fresh :
152+ logger .warning ("--rerun-judge and --fresh are mutually exclusive, using --rerun-judge" )
153+ logger .info ("Re-running pairwise evaluation with new judge model..." )
154+ if not _clear_judge_results (effective_output_dir ):
155+ logger .info ("No previous results found, will run full evaluation" )
156+ elif fresh :
105157 logger .info ("Starting fresh (ignoring checkpoint)" )
106158 else :
107159 logger .info ("Resume mode enabled (will continue from checkpoint if exists)" )
108160
109- asyncio .run (_run_evaluation (str (config_path ), output_dir , queries_file , save , resume = not fresh ))
161+ logger .info (f"Starting zero-shot evaluation with config: { config } " )
162+ if queries_file :
163+ logger .info (f"Using pre-generated queries from: { queries_file } " )
164+
165+ asyncio .run (_run_evaluation (str (config_path ), output_dir , queries_file , save , resume = not fresh or rerun_judge ))
110166
111167
112168if __name__ == "__main__" :
0 commit comments