Commit 08b339a

finish scicode inspection_ai integration
1 parent 556ffde commit 08b339a

File tree

13 files changed: +57 -41076 lines changed

.gitignore
Lines changed: 2 additions & 1 deletion

@@ -2,12 +2,13 @@
 keys.cfg
 **/test_result/**
 **/output/**
-**/eval_results/**
+**/eval_results*/**
 eval/logs/**
 *.h5
 logs/**
 **/logs/**
 **/tmp/**
+integration/**
 
 # -------
 
integration/inspection_ai/scicode.py renamed to eval/inspection_ai/scicode.py
Lines changed: 55 additions & 43 deletions

@@ -1,3 +1,4 @@
+import copy
 import time
 import shutil
 import subprocess
@@ -10,16 +11,8 @@
 from scicode.parse.parse import extract_function_name, get_function_from_code
 from scicode.gen.models import generate_dummy_response, extract_python_script
 
-SAVE = True
-TEMP_DIR = "./tmp"
-MODEL_NAME = "gpt-4o"
-WITH_BACKGROUND = False
-H5PY_FILE = "/eagle/tpc/zilinghan/SciCode/eval/data/test_data.h5"
-BACKGOUND_PROMPT_TEMPLATE = Path("data", "multistep_template.txt").read_text()
-DEFAULT_PROMPT_TEMPLATE = Path("data", "background_comment_template.txt").read_text()
-SCICODE_DATA_JSON_PATH = "/eagle/tpc/zilinghan/SciCode/integration/inspection_ai/data/problems_all_new.json"
-# SCICODE_DATA_JSON_PATH = "/eagle/tpc/zilinghan/SciCode/integration/inspection_ai/data/problems_dev.json"
-# SCICODE_DATA_JSON_PATH = "/eagle/tpc/zilinghan/SciCode/integration/inspection_ai/data/problems_all.json"
+BACKGOUND_PROMPT_TEMPLATE = Path("../data", "multistep_template.txt").read_text()
+DEFAULT_PROMPT_TEMPLATE = Path("../data", "background_comment_template.txt").read_text()
 
 class ScicodePromptingAssistant:
     def __init__(
@@ -141,11 +134,6 @@ def prepare_final_prompt_with_steps(
         save: bool = True
     ):
         prob_id = prob_data["problem_id"]
-        output_file_path = Path(
-            self.output_dir,
-            self._get_background_dir(),
-            f"{prob_id}.{num_steps}.py"
-        )
         if num_steps == 1:
             self.previous_llm_code = [None] * tot_steps
         else:
@@ -159,7 +147,7 @@ def prepare_final_prompt_with_steps(
                 (prob_id == "76" and prev_step == 2)
             ):
                 prev_file_path = Path(
-                    "data",
+                    "../data",
                     f"{prob_id}.{prev_step+1}.txt"
                 )
             else:
@@ -224,6 +212,12 @@ def test_code(
         sub_steps = prob_data["sub_steps"]
         problem_id = prob_data["problem_id"]
        for idx in range(len(sub_steps)):
+            if (
+                (problem_id == "13" and idx == 5) or
+                (problem_id == "62" and idx == 0) or
+                (problem_id == "76" and idx == 2)
+            ):
+                continue
             step_id = sub_steps[idx]["step_number"]
             code_file_path = Path(code_dir, f"{step_id}.py")
             assert code_file_path.is_file(), f"Code file {code_file_path} not found."
@@ -249,11 +243,8 @@ def run_script(script_path):
                            text=True, timeout=1800)
                return 0
            except subprocess.CalledProcessError as e:
-                print(f"Error running script {script_path}: {e}")
-                print(e.output)
                return 1
            except subprocess.TimeoutExpired as e:
-                print(f"Runtime error while running script {script_path}: {e}")
                return 2
 
        total_steps = len(sub_steps)
@@ -288,7 +279,6 @@ def run_script(script_path):
                with open(logs_file, 'w') as f:
                    f.write('pass')
                total_correct += 1
-                print(f"Problem {problem_id} step {idx + 1} passed.")
            elif ret == 1:
                with open(logs_file, 'w') as f:
                    f.write('fail')
@@ -310,23 +300,18 @@ def record_to_sample(record):
        }
    )
 
-dataset = json_dataset(
-    SCICODE_DATA_JSON_PATH,
-    record_to_sample
-)
-
+def generate_gold_response(prob_data: dict, num_steps: int):
+    return f"Blah blah\n```python\n{prob_data['sub_steps'][num_steps - 1]['ground_truth_code']}\n```\n"
 
 @solver
 def scicode_solver(**params: dict[str, Any]):
    async def solve(state: TaskState, generate: Generate) -> TaskState:
        prompt_assistant = ScicodePromptingAssistant(
-            output_dir=Path(TEMP_DIR, "generated_code"),
-            prompt_dir=Path(TEMP_DIR, "prompt"),
-            with_background=WITH_BACKGROUND,
+            output_dir=Path(params["output_dir"], "generated_code"),
+            prompt_dir=Path(params["output_dir"], "prompt"),
+            with_background=params["with_background"],
        )
-        prompt_template = BACKGOUND_PROMPT_TEMPLATE if WITH_BACKGROUND else DEFAULT_PROMPT_TEMPLATE
-        print('===============================')
-        print(f'Processing problem {state.sample_id}')
+        prompt_template = BACKGOUND_PROMPT_TEMPLATE if params["with_background"] else DEFAULT_PROMPT_TEMPLATE
        sub_steps = state.metadata["sub_steps"]
        for idx in range(len(sub_steps)):
            prob_id = state.metadata["problem_id"]
@@ -342,16 +327,24 @@ async def solve(state: TaskState, generate: Generate) -> TaskState:
                tot_steps=len(sub_steps),
                prompt_template=prompt_template,
            )
-            response_from_llm = generate_dummy_response(prompt)
+            if params["mode"] == "dummy":
+                response_from_llm = generate_dummy_response(prompt)
+            elif params["mode"] == "gold":
+                response_from_llm = generate_gold_response(state.metadata, idx+1)
+            else:
+                # ===Model Generation===
+                state.user_prompt.text = prompt
+                state_copy = copy.deepcopy(state)
+                result = await generate(state=state_copy)
+                response_from_llm = result.output.completion
+                # ===Model Generation===
            prompt_assistant.register_previous_response(
                prob_data=state.metadata,
                response=response_from_llm,
                previous_code=previous_code,
                num_steps=idx+1,
            )
-        print('===============================')
        return state
-
    return solve
 
 @metric
@@ -362,7 +355,6 @@ def metric(scores: list[Score]) -> int | float:
        for score in scores:
            total_correct += score.value["Total Correct"]
            total_steps += score.value["Total Steps"]
-        print(f"Total correct: {total_correct}, Total steps: {total_steps}")
        return total_correct / total_steps
    return metric
 
@@ -371,13 +363,13 @@ def metric(scores: list[Score]) -> int | float:
        "Problem Correctness": [mean()],
    }, sub_problem_correctness()]
 )
-def test_scorer():
+def scicode_scorer(**params: dict[str, Any]):
    async def score(state: TaskState, target: Target):
        evaluator = ScicodeEvaluator(
-            h5py_file=H5PY_FILE,
-            code_dir=TEMP_DIR,
-            log_dir=TEMP_DIR,
-            with_background=WITH_BACKGROUND,
+            h5py_file=params["h5py_file"],
+            code_dir=params["output_dir"],
+            log_dir=params["output_dir"],
+            with_background=params["with_background"],
        )
        problem_correct, total_correct, total_steps = evaluator.test_code(state.metadata)
        return Score(
@@ -390,9 +382,29 @@ async def score(state: TaskState, target: Target):
    return score
 
 @task
-def dummy_task():
+def scicode(
+    input_path: str = '../data/problems_all.jsonl',
+    output_dir: str = './tmp',
+    with_background: bool = False,
+    h5py_file: str = '../data/test_data.h5',
+    mode: str = 'normal',
+):
+    dataset = json_dataset(
+        input_path,
+        record_to_sample
+    )
    return Task(
        dataset=dataset,
-        solver=scicode_solver(),
-        scorer=test_scorer(),
+        solver=scicode_solver(
+            input_path=input_path,
+            output_dir=output_dir,
+            with_background=with_background,
+            mode=mode,
+        ),
+        scorer=scicode_scorer(
+            input_path=input_path,
+            output_dir=output_dir,
+            with_background=with_background,
+            h5py_file=h5py_file,
+        ),
    )
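
Note: with the task fully parameterized, a run no longer requires editing module-level constants. A minimal usage sketch (not part of this commit), assuming inspect_ai's eval() entry point is available and the task module is importable; the "scicode_task" alias below is hypothetical, used only to avoid clashing with the scicode package the module itself imports:

    # Hypothetical usage sketch -- assumes inspect_ai's eval() entry point;
    # "scicode_task" is an illustrative alias for eval/inspection_ai/scicode.py.
    from inspect_ai import eval as inspect_eval
    from scicode_task import scicode

    # mode="gold" replays each step's ground_truth_code through the same
    # prompting and scoring pipeline, validating the harness end to end
    # without spending model tokens; mode="normal" queries the model.
    inspect_eval(
        scicode(
            output_dir="./tmp",
            with_background=False,
            mode="gold",
        ),
        model="openai/gpt-4o",
    )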

integration/inspection_ai/data/13.6.txt

Lines changed: 0 additions & 50 deletions
This file was deleted.

integration/inspection_ai/data/62.1.txt

Lines changed: 0 additions & 29 deletions
This file was deleted.

integration/inspection_ai/data/76.3.txt

Lines changed: 0 additions & 27 deletions
This file was deleted.

integration/inspection_ai/data/background_comment_template.txt

Lines changed: 0 additions & 28 deletions
This file was deleted.

integration/inspection_ai/data/multistep_template.txt

Lines changed: 0 additions & 23 deletions
This file was deleted.
