
Commit 59471da

feat(zero_shot): enhance evaluation pipeline with rerun-judge and chart improvements (#63)
* feat(zero_shot): enhance evaluation pipeline with rerun-judge and chart improvements
  - Add --rerun-judge flag to re-run pairwise evaluation with different judge model
  - Enhance chart_generator with additional visualization options
  - Update schema with new configuration options
  - Improve zero_shot_pipeline with better checkpoint management
  - Fix minor issue in openai_chat_model

* fix: address code review comments
  - Use model.lower() for case-insensitive model name matching in openai_chat_model.py
  - Consolidate rerun_judge and fresh flag handling logic in __main__.py
1 parent e5a2879 commit 59471da
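
The core of the new --rerun-judge path is a checkpoint rollback: comparison_details.json is deleted and checkpoint.json is rewound to the rubrics_generated stage so that only pairwise judging re-executes. A minimal sketch of the intended effect on checkpoint.json, assuming a previously completed run (only the keys "stage", "evaluated_pairs", and "total_pairs" appear in this commit; the pre-reset values, the "completed" stage name, and any other fields are illustrative):

# Illustrative only: shape of checkpoint.json around a --rerun-judge reset.
before_reset = {"stage": "completed", "evaluated_pairs": 120, "total_pairs": 120}
# After _clear_judge_results(output_dir), queries/responses/rubrics on disk
# stay put, but the pipeline believes pairwise evaluation has not started yet:
after_reset = {"stage": "rubrics_generated", "evaluated_pairs": 0, "total_pairs": 0}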

File tree

5 files changed (+788, -81 lines)

cookbooks/zero_shot_evaluation/__main__.py

Lines changed: 61 additions & 5 deletions
@@ -5,6 +5,7 @@
     python -m cookbooks.zero_shot_evaluation --config config.yaml
     python -m cookbooks.zero_shot_evaluation --config config.yaml --save
     python -m cookbooks.zero_shot_evaluation --config config.yaml --queries_file queries.json --save
+    python -m cookbooks.zero_shot_evaluation --config config.yaml --rerun-judge --save
 """
 
 import asyncio
@@ -28,6 +29,43 @@ def _load_queries_from_file(queries_file: str) -> List[GeneratedQuery]:
     return queries
 
 
+def _clear_judge_results(output_dir: str) -> bool:
+    """Clear only comparison results to re-run with new judge model.
+
+    Keeps queries, responses, and rubrics intact.
+    Only removes comparison_details.json and resets checkpoint to RUBRICS_GENERATED.
+
+    Returns:
+        True if successfully cleared, False if no checkpoint exists
+    """
+    output_path = Path(output_dir)
+    checkpoint_file = output_path / "checkpoint.json"
+    details_file = output_path / "comparison_details.json"
+
+    if not checkpoint_file.exists():
+        logger.warning("No checkpoint found, nothing to clear")
+        return False
+
+    # Remove comparison details
+    if details_file.exists():
+        details_file.unlink()
+        logger.info(f"Removed {details_file}")
+
+    # Update checkpoint to RUBRICS_GENERATED stage
+    with open(checkpoint_file, "r", encoding="utf-8") as f:
+        checkpoint = json.load(f)
+
+    checkpoint["stage"] = "rubrics_generated"
+    checkpoint["evaluated_pairs"] = 0
+    checkpoint["total_pairs"] = 0
+
+    with open(checkpoint_file, "w", encoding="utf-8") as f:
+        json.dump(checkpoint, f, indent=2, ensure_ascii=False)
+
+    logger.info("Reset checkpoint to RUBRICS_GENERATED stage (will re-run pairwise evaluation)")
+    return True
+
+
 async def _run_evaluation(
     config_path: str,
     output_dir: Optional[str] = None,
@@ -67,6 +105,7 @@ def main(
     queries_file: Optional[str] = None,
     save: bool = False,
     fresh: bool = False,
+    rerun_judge: bool = False,
 ) -> None:
     """Zero-shot evaluation CLI with checkpoint support.
 
@@ -76,6 +115,8 @@ def main(
         queries_file: Path to pre-generated queries JSON (skip query generation)
         save: Whether to save results to file
         fresh: Start fresh, ignore any existing checkpoint
+        rerun_judge: Re-run only pairwise evaluation with new judge model
+            (keeps queries, responses, and rubrics)
 
     Examples:
         # Normal run (auto-resumes from checkpoint)
@@ -86,6 +127,9 @@ def main(
 
         # Start fresh, ignore checkpoint
         python -m cookbooks.zero_shot_evaluation --config config.yaml --fresh --save
+
+        # Re-run with new judge model (keeps queries/responses/rubrics)
+        python -m cookbooks.zero_shot_evaluation --config config.yaml --rerun-judge --save
     """
     config_path = Path(config)
     if not config_path.exists():
@@ -98,15 +142,27 @@ def main(
         logger.error(f"Queries file not found: {queries_file}")
         return
 
-    logger.info(f"Starting zero-shot evaluation with config: {config}")
-    if queries_file:
-        logger.info(f"Using pre-generated queries from: {queries_file}")
-    if fresh:
+    # Load config to get output_dir
+    loaded_config = load_config(str(config_path))
+    effective_output_dir = output_dir or loaded_config.output.output_dir
+
+    # Handle rerun_judge and fresh flags
+    if rerun_judge:
+        if fresh:
+            logger.warning("--rerun-judge and --fresh are mutually exclusive, using --rerun-judge")
+        logger.info("Re-running pairwise evaluation with new judge model...")
+        if not _clear_judge_results(effective_output_dir):
+            logger.info("No previous results found, will run full evaluation")
+    elif fresh:
         logger.info("Starting fresh (ignoring checkpoint)")
     else:
        logger.info("Resume mode enabled (will continue from checkpoint if exists)")
 
-    asyncio.run(_run_evaluation(str(config_path), output_dir, queries_file, save, resume=not fresh))
+    logger.info(f"Starting zero-shot evaluation with config: {config}")
+    if queries_file:
+        logger.info(f"Using pre-generated queries from: {queries_file}")
+
+    asyncio.run(_run_evaluation(str(config_path), output_dir, queries_file, save, resume=not fresh or rerun_judge))
 
 
 if __name__ == "__main__":
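
For reference, the CLI path added above amounts to clearing the judge results and resuming from the rolled-back checkpoint. A minimal programmatic sketch of that flow, assuming the private helpers are importable from the package's __main__ module and that results/zero_shot is the configured output directory (both assumptions; the function names and the call signature are taken from the diff):

import asyncio

from cookbooks.zero_shot_evaluation.__main__ import _clear_judge_results, _run_evaluation

output_dir = "results/zero_shot"  # assumed: whatever output_dir the config resolves to

# Delete comparison_details.json and reset the checkpoint to rubrics_generated;
# queries, responses, and rubrics are kept, so only pairwise judging re-runs.
if not _clear_judge_results(output_dir):
    print("No previous results found, a full evaluation will run instead")

# Positional args mirror the call site in main(): config_path, output_dir,
# queries_file, save; resume=True continues from the rolled-back checkpoint.
asyncio.run(_run_evaluation("config.yaml", output_dir, None, True, resume=True))

In practice this is what python -m cookbooks.zero_shot_evaluation --config config.yaml --rerun-judge --save does, after reading output_dir from the loaded config.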
