@@ -95,20 +95,11 @@ def evaluate(flags):
9595 # bypass the samples
9696 flags .samples = "__dummy__.jsonl"
9797
98- if flags .calibrate :
99- assert "calibrate" in flags .samples , "Calibration is only supported for calibrated samples"
100-
10198 if os .path .isdir (flags .samples ):
102- if flags .calibrate :
103- result_path = os .path .join (flags .samples , "calibrate_eval_results.json" )
104- else :
105- result_path = os .path .join (flags .samples , "eval_results.json" )
99+ result_path = os .path .join (flags .samples , "eval_results.json" )
106100 else :
107101 assert flags .samples .endswith (".jsonl" )
108- if flags .calibrate :
109- result_path = flags .samples .replace (".jsonl" , "_calibrate_eval_results.json" )
110- else :
111- result_path = flags .samples .replace (".jsonl" , "_eval_results.json" )
102+ result_path = flags .samples .replace (".jsonl" , "_eval_results.json" )
112103
113104 if os .path .isfile (result_path ):
114105 print (f"Load from previous results from { result_path } " )
@@ -153,7 +144,7 @@ def evaluate(flags):
153144 if "solution" in sample
154145 else problems [task_id ]["prompt" ] + sample ["completion" ]
155146 )
156- if flags .calibrate :
147+ if "sanitized-calibrate" in flags .samples :
157148 solution = problems [task_id ]["prompt_wo_doc" ] + "\n pass\n " + solution
158149 remainings .add (sample ["_identifier" ])
159150 args = (
@@ -250,9 +241,6 @@ def main():
250241 parser .add_argument ("--samples" , required = True , type = str )
251242 parser .add_argument ("--parallel" , default = None , type = int )
252243 parser .add_argument ("--min-time-limit" , default = 1 , type = float )
253- parser .add_argument (
254- "--calibrate" , action = "store_true" , help = "Calibrate the evaluation results"
255- )
256244 parser .add_argument (
257245 "--check-gt-only" , action = "store_true" , help = "Check the groundtruth"
258246 )
0 commit comments