Skip to content

Commit db950db

Browse files
committed
build: ensure llm-judge cache is populated
1 parent 2bc2256 commit db950db

File tree

1 file changed

+3 -3 lines changed

leaderboard-submissions/hydrate.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -162,17 +162,17 @@ async def eval_submission(metadata_fp: Path, check_result: CheckResult):
162 | 162 |
163 | 163 |      print("Evaluating closed book answers...")
164 | 164 |      closedbook_answers = read_jsonl_answers(CB_PATH / check_result.metadata.closedbook_generations)
165 |     | -    closedbook_scorer = Scorer(questions, closedbook_answers)
    | 165 | +    closedbook_scorer = Scorer(questions, closedbook_answers, llm_cache_key="eval")
166 | 166 |      closedbook_results = (await closedbook_scorer.score()).to_dict()
167 | 167 |
168 | 168 |      print("Evaluating open book answers...")
169 | 169 |      openbook_answers = read_jsonl_answers(OB_PATH / check_result.metadata.openbook_generations)
170 |     | -    openbook_scorer = Scorer(questions, openbook_answers)
    | 170 | +    openbook_scorer = Scorer(questions, openbook_answers, llm_cache_key="eval")
171 | 171 |      openbook_results = (await openbook_scorer.score()).to_dict()
172 | 172 |
173 | 173 |      print("Evaluating evidence provided answers...")
174 | 174 |      evidenceprovided_answers = read_jsonl_answers(EP_PATH / check_result.metadata.evidenceprovided_generations)
175 |     | -    evidenceprovided_scorer = Scorer(questions, evidenceprovided_answers)
    | 175 | +    evidenceprovided_scorer = Scorer(questions, evidenceprovided_answers, llm_cache_key="eval")
176 | 176 |      evidenceprovided_results = (await evidenceprovided_scorer.score()).to_dict()
177 | 177 |
178 | 178 |      # hash the results to prevent score manipulation

0 commit comments

Comments
 (0)