@@ -1,3 +1,4 @@
+import copy
 import time
 import shutil
 import subprocess
@@ -10,16 +11,8 @@
 from scicode.parse.parse import extract_function_name, get_function_from_code
 from scicode.gen.models import generate_dummy_response, extract_python_script
 
-SAVE = True
-TEMP_DIR = "./tmp"
-MODEL_NAME = "gpt-4o"
-WITH_BACKGROUND = False
-H5PY_FILE = "/eagle/tpc/zilinghan/SciCode/eval/data/test_data.h5"
-BACKGOUND_PROMPT_TEMPLATE = Path("data", "multistep_template.txt").read_text()
-DEFAULT_PROMPT_TEMPLATE = Path("data", "background_comment_template.txt").read_text()
-SCICODE_DATA_JSON_PATH = "/eagle/tpc/zilinghan/SciCode/integration/inspection_ai/data/problems_all_new.json"
-# SCICODE_DATA_JSON_PATH = "/eagle/tpc/zilinghan/SciCode/integration/inspection_ai/data/problems_dev.json"
-# SCICODE_DATA_JSON_PATH = "/eagle/tpc/zilinghan/SciCode/integration/inspection_ai/data/problems_all.json
+BACKGOUND_PROMPT_TEMPLATE = Path("../data", "multistep_template.txt").read_text()
+DEFAULT_PROMPT_TEMPLATE = Path("../data", "background_comment_template.txt").read_text()
 
 class ScicodePromptingAssistant:
     def __init__(
@@ -141,11 +134,6 @@ def prepare_final_prompt_with_steps(
         save: bool = True
     ):
         prob_id = prob_data["problem_id"]
-        output_file_path = Path(
-            self.output_dir,
-            self._get_background_dir(),
-            f"{prob_id}.{num_steps}.py"
-        )
         if num_steps == 1:
             self.previous_llm_code = [None] * tot_steps
         else:
@@ -159,7 +147,7 @@ def prepare_final_prompt_with_steps(
                     (prob_id == "76" and prev_step == 2)
                 ):
                     prev_file_path = Path(
-                        "data",
+                        "../data",
                         f"{prob_id}.{prev_step + 1}.txt"
                     )
                 else:
@@ -224,6 +212,12 @@ def test_code(
         sub_steps = prob_data["sub_steps"]
         problem_id = prob_data["problem_id"]
         for idx in range(len(sub_steps)):
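+            # Steps 13.6, 62.1, and 76.3 are handed to the model as provided
+            # gold solutions (see prepare_final_prompt_with_steps), so they
+            # are excluded from scoring.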
+            if (
+                (problem_id == "13" and idx == 5) or
+                (problem_id == "62" and idx == 0) or
+                (problem_id == "76" and idx == 2)
+            ):
+                continue
             step_id = sub_steps[idx]["step_number"]
             code_file_path = Path(code_dir, f"{step_id}.py")
             assert code_file_path.is_file(), f"Code file {code_file_path} not found."
@@ -249,11 +243,8 @@ def run_script(script_path):
                                text=True, timeout=1800)
                 return 0
             except subprocess.CalledProcessError as e:
-                print(f"Error running script {script_path}: {e}")
-                print(e.output)
                 return 1
             except subprocess.TimeoutExpired as e:
-                print(f"Runtime error while running script {script_path}: {e}")
                 return 2
 
         total_steps = len(sub_steps)
@@ -288,7 +279,6 @@ def run_script(script_path):
                 with open(logs_file, 'w') as f:
                     f.write('pass')
                 total_correct += 1
-                print(f"Problem {problem_id} step {idx + 1} passed.")
             elif ret == 1:
                 with open(logs_file, 'w') as f:
                     f.write('fail')
@@ -310,23 +300,18 @@ def record_to_sample(record):
         }
     )
 
-dataset = json_dataset(
-    SCICODE_DATA_JSON_PATH,
-    record_to_sample
-)
-
+def generate_gold_response(prob_data: dict, num_steps: int):
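+    # Mimic a model reply: filler prose followed by the step's ground-truth
+    # code in a fenced block, so it parses like real model output downstream.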
+    return f"Blah blah\n```python\n{prob_data['sub_steps'][num_steps - 1]['ground_truth_code']}\n```\n"
 
 @solver
 def scicode_solver(**params: dict[str, Any]):
     async def solve(state: TaskState, generate: Generate) -> TaskState:
         prompt_assistant = ScicodePromptingAssistant(
-            output_dir=Path(TEMP_DIR, "generated_code"),
-            prompt_dir=Path(TEMP_DIR, "prompt"),
-            with_background=WITH_BACKGROUND,
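+            # Generated code and prompts land under the caller-supplied output directory.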
+            output_dir=Path(params["output_dir"], "generated_code"),
+            prompt_dir=Path(params["output_dir"], "prompt"),
+            with_background=params["with_background"],
         )
-        prompt_template = BACKGOUND_PROMPT_TEMPLATE if WITH_BACKGROUND else DEFAULT_PROMPT_TEMPLATE
-        print('===============================')
-        print(f'Processing problem {state.sample_id}')
+        prompt_template = BACKGOUND_PROMPT_TEMPLATE if params["with_background"] else DEFAULT_PROMPT_TEMPLATE
         sub_steps = state.metadata["sub_steps"]
         for idx in range(len(sub_steps)):
             prob_id = state.metadata["problem_id"]
@@ -342,16 +327,24 @@ async def solve(state: TaskState, generate: Generate) -> TaskState:
                 tot_steps=len(sub_steps),
                 prompt_template=prompt_template,
             )
-            response_from_llm = generate_dummy_response(prompt)
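+            # Select the response source by mode: canned dummy text, the gold
+            # reference code, or a live model call.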
+            if params["mode"] == "dummy":
+                response_from_llm = generate_dummy_response(prompt)
+            elif params["mode"] == "gold":
+                response_from_llm = generate_gold_response(state.metadata, idx + 1)
+            else:
+                # ===Model Generation===
+                state.user_prompt.text = prompt
+                state_copy = copy.deepcopy(state)
+                result = await generate(state=state_copy)
+                response_from_llm = result.output.completion
+                # ===Model Generation===
             prompt_assistant.register_previous_response(
                 prob_data=state.metadata,
                 response=response_from_llm,
                 previous_code=previous_code,
                 num_steps=idx + 1,
             )
-        print('===============================')
         return state
-
     return solve
 
 @metric
@@ -362,7 +355,6 @@ def metric(scores: list[Score]) -> int | float:
         for score in scores:
             total_correct += score.value["Total Correct"]
             total_steps += score.value["Total Steps"]
-            print(f"Total correct: {total_correct}, Total steps: {total_steps}")
         return total_correct / total_steps
     return metric
 
@@ -371,13 +363,13 @@ def metric(scores: list[Score]) -> int | float:
371363 "Problem Correctness" : [mean ()],
372364 }, sub_problem_correctness ()]
373365)
374- def test_scorer ( ):
366+ def scicode_scorer ( ** params : dict [ str , Any ] ):
375367 async def score (state : TaskState , target : Target ):
376368 evaluator = ScicodeEvaluator (
377- h5py_file = H5PY_FILE ,
378- code_dir = TEMP_DIR ,
379- log_dir = TEMP_DIR ,
380- with_background = WITH_BACKGROUND ,
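+            # code_dir and log_dir share output_dir, matching where the solver saved generated code.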
+            h5py_file=params["h5py_file"],
+            code_dir=params["output_dir"],
+            log_dir=params["output_dir"],
+            with_background=params["with_background"],
         )
         problem_correct, total_correct, total_steps = evaluator.test_code(state.metadata)
         return Score(
@@ -390,9 +382,29 @@ async def score(state: TaskState, target: Target):
     return score
 
 @task
-def dummy_task():
+def scicode(
+    input_path: str = '../data/problems_all.jsonl',
+    output_dir: str = './tmp',
+    with_background: bool = False,
+    h5py_file: str = '../data/test_data.h5',
+    mode: str = 'normal',
+):
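+    # Build the dataset from the problem file and fan the task options out to solver and scorer.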
+    dataset = json_dataset(
+        input_path,
+        record_to_sample
+    )
     return Task(
         dataset=dataset,
-        solver=scicode_solver(),
-        scorer=test_scorer(),
+        solver=scicode_solver(
+            input_path=input_path,
+            output_dir=output_dir,
+            with_background=with_background,
+            mode=mode,
+        ),
+        scorer=scicode_scorer(
+            input_path=input_path,
+            output_dir=output_dir,
+            with_background=with_background,
+            h5py_file=h5py_file,
+        ),
     )
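
Usage note: a minimal sketch (not part of this commit) of driving the refactored task through inspect_ai's Python API. The module name scicode_task and the model id are assumptions for illustration only.

    # Assumes this file is saved as scicode_task.py and inspect_ai is installed.
    from inspect_ai import eval
    from scicode_task import scicode

    eval(
        scicode(mode="gold"),     # "gold" replays reference solutions as a scoring sanity check
        model="openai/gpt-4o",    # any inspect_ai-supported model id
    )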