@@ -96,16 +96,19 @@ def evaluate(flags):
     if flags.check_gt_only:
         # bypass the samples
        flags.samples = "__dummy__.jsonl"
-
+
+    if flags.calibrate:
+        assert "calibrate" in flags.samples, "Calibration is only supported for calibrated samples"
+
     if os.path.isdir(flags.samples):
-        if flags.reprompt:
-            result_path = os.path.join(flags.samples, "reprompt_eval_results.json")
+        if flags.calibrate:
+            result_path = os.path.join(flags.samples, "calibrate_eval_results.json")
         else:
             result_path = os.path.join(flags.samples, "eval_results.json")
     else:
         assert flags.samples.endswith(".jsonl")
-        if flags.reprompt:
-            result_path = flags.samples.replace(".jsonl", "_reprompt_eval_results.json")
+        if flags.calibrate:
+            result_path = flags.samples.replace(".jsonl", "_calibrate_eval_results.json")
         else:
             result_path = flags.samples.replace(".jsonl", "_eval_results.json")
 
@@ -150,7 +153,7 @@ def evaluate(flags):
                 if "solution" in sample
                 else problems[task_id]["prompt"] + sample["completion"]
             )
-            if flags.reprompt:
+            if flags.calibrate:
                 solution = problems[task_id]["prompt_wo_doc"] + "\n    pass\n" + solution
             remainings.add(sample["_identifier"])
             args = (
@@ -249,7 +252,7 @@ def main():
     parser.add_argument("--parallel", default=None, type=int)
     parser.add_argument("--min-time-limit", default=1, type=float)
     parser.add_argument(
-        "--reprompt", action="store_true", help="Prepend the prompt again"
+        "--calibrate", action="store_true", help="Calibrate the evaluation results"
     )
     parser.add_argument(
         "--check-gt-only", action="store_true", help="Check the groundtruth"
0 commit comments