Skip to content

Commit 89fccc3

Browse files
author
yanxinl4
committed
add litellm + bg setup
1 parent 1813032 commit 89fccc3

File tree

291 files changed

+113
-371
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

291 files changed

+113
-371
lines changed

eval/data/problems_all.jsonl

Lines changed: 65 additions & 65 deletions
Large diffs are not rendered by default.

eval/scripts/gencode_json.py

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,23 +14,27 @@
1414

1515
class Gencode:
1616
def __init__(self, model: str, output_dir: Path,
17-
prompt_dir: Path, temperature: float):
17+
prompt_dir: Path, with_background: bool, temperature: float):
1818
self.model = model
1919
self.output_dir = output_dir
2020
self.prompt_dir = prompt_dir
21+
self.with_background = with_background
2122
self.temperature = temperature
2223
self.previous_llm_code = []
2324

24-
def save_prompt_with_steps(self, prob_data: dict, prompt: str, num_steps: int, tot_steps: int) -> None:
25-
output_dir = Path(self.prompt_dir, self.model)
25+
def _get_background_dir(self):
26+
return "with_background" if self.with_background else "without_background"
27+
28+
def save_prompt_with_steps(self, prob_data: dict, prompt: str, num_steps: int) -> None:
29+
output_dir = Path(self.prompt_dir, Path(self.model).parts[-1], self._get_background_dir())
2630
output_dir.mkdir(parents=True, exist_ok=True)
2731
output_file_path = output_dir / f"{prob_data['problem_id']}.{num_steps}.txt"
2832
output_file_path.write_text(prompt, encoding="utf-8")
2933

30-
def save_response_with_steps(self, prob_data: dict, response: str, previous_code: str,
31-
num_steps: int, model="gpt-4o",) -> None:
34+
def save_response_with_steps(self, prob_data: dict, response: str,
35+
previous_code: str, num_steps: int) -> None:
3236
output_dir = (
33-
self.output_dir / model
37+
self.output_dir / Path(self.model).parts[-1] / self._get_background_dir()
3438
)
3539
output_dir.mkdir(parents=True, exist_ok=True)
3640
prob_id = prob_data["problem_id"]
@@ -78,7 +82,7 @@ def generate_response_with_steps(
7882
raise Exception(f'Generating {prob_id} step {num_steps} ahead of step {prev_step + 1}.')
7983
prompt, previous_code = self.generate_prompt_with_steps(prob_data, num_steps, prompt_template)
8084
if save:
81-
self.save_prompt_with_steps(prob_data, prompt, num_steps, tot_steps)
85+
self.save_prompt_with_steps(prob_data, prompt, num_steps)
8286

8387
model_kwargs = {}
8488
if "claude" in model:
@@ -94,7 +98,7 @@ def generate_response_with_steps(
9498
model_fct = get_model_function(model, **model_kwargs)
9599
response_from_llm = model_fct(prompt)
96100
self.previous_llm_code[num_steps - 1] = extract_python_script(response_from_llm)
97-
self.save_response_with_steps(prob_data, response_from_llm, previous_code, num_steps, model)
101+
self.save_response_with_steps(prob_data, response_from_llm, previous_code, num_steps)
98102

99103
@staticmethod
100104
def process_problem_code(prob_data: dict, num_steps: int) -> str:
@@ -109,11 +113,16 @@ def process_problem_steps(self, problem_data: dict, num_steps: int):
109113
next_step = []
110114
previous_code = []
111115
for i in range(num_steps - 1):
116+
output_lines.append(problem_data["sub_steps"][i]["step_description_prompt"] + '\n' +
117+
problem_data["sub_steps"][i]["step_background"] if self.with_background
118+
else problem_data["sub_steps"][i]["step_description_prompt"])
112119
output_lines.append(self.previous_llm_code[i])
113120
previous_code.append(self.previous_llm_code[i])
114121
output_lines.append("------")
115122

116-
next_step.append(problem_data["sub_steps"][num_steps - 1]["step_description_prompt"])
123+
next_step.append(problem_data["sub_steps"][num_steps - 1]["step_description_prompt"] + '\n' +
124+
problem_data["sub_steps"][num_steps - 1]["step_background"] if self.with_background
125+
else problem_data["sub_steps"][num_steps - 1]["step_description_prompt"])
117126
next_step.append(self.process_problem_code(problem_data, num_steps))
118127
output_str = "\n\n".join(output_lines[:-1]) # Remove the last "------"
119128
next_step_str = "\n\n".join(next_step)
@@ -160,6 +169,11 @@ def get_cli() -> argparse.ArgumentParser:
160169
default=Path("eval_results", "prompt"),
161170
help="Prompt directory",
162171
)
172+
parser.add_argument(
173+
"--with-background",
174+
action="store_true",
175+
help="Include problem background if enabled",
176+
)
163177
parser.add_argument(
164178
"--temperature",
165179
type=float,
@@ -173,11 +187,12 @@ def main(model: str,
173187
output_dir: Path,
174188
input_path: Path,
175189
prompt_dir: Path,
190+
with_background: bool,
176191
temperature: float
177192
) -> None:
178193
gcode = Gencode(
179194
model=model, output_dir=output_dir,
180-
prompt_dir=prompt_dir, temperature=temperature
195+
prompt_dir=prompt_dir, with_background=with_background, temperature=temperature
181196
)
182197
data = read_from_jsonl(input_path)
183198
for problem in data:

eval/scripts/test_generated_code.py

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import numpy as np
77
import argparse
88

9+
from scicode.parse.parse import H5PY_FILE
910
from scicode.parse.parse import read_from_jsonl
1011

1112

@@ -15,7 +16,12 @@
1516
DEV_STEP_NUM = 50
1617

1718

18-
def test_code(model_name, code_dir, log_dir, output_dir, jsonl_path, dev_set=False):
19+
def _get_background_dir(with_background):
20+
return "with_background" if with_background else "without_background"
21+
22+
23+
def test_code(model_name, code_dir, log_dir, output_dir,
24+
jsonl_path, dev_set=False, with_background=False):
1925

2026
jsonl_data = read_from_jsonl(jsonl_path)
2127
json_dct = {}
@@ -26,7 +32,7 @@ def test_code(model_name, code_dir, log_dir, output_dir, jsonl_path, dev_set=Fal
2632
json_idx[prob_data['problem_id']] = jsonl_data.index(prob_data)
2733
start_time = time.time()
2834

29-
code_dir_ = Path(code_dir, model_name)
35+
code_dir_ = Path(code_dir, model_name, _get_background_dir(with_background))
3036
tmp_dir = Path(f'tmp_{start_time}')
3137

3238
tmp_dir.mkdir(parents=True, exist_ok=True)
@@ -82,7 +88,7 @@ def run_script(script_path):
8288
prob_id = func_id.split('.')[0]
8389
print(f'Testing function {func_id} ...')
8490
tot_prob[int(prob_id) - 1] += 1
85-
logs_dir_ = Path(log_dir, model_name)
91+
logs_dir_ = Path(log_dir, model_name, _get_background_dir(with_background))
8692
logs_dir_.mkdir(parents=True, exist_ok=True)
8793
logs_file = Path(logs_dir_, f'{file_path.stem}.txt')
8894
if logs_file.exists():
@@ -116,16 +122,16 @@ def run_script(script_path):
116122
print(f'correct problems: {correct_prob_num}/{DEV_PROB_NUM if dev_set else PROB_NUM - DEV_PROB_NUM}')
117123
print(f'correct steps: {len(correct_step)}/{DEV_STEP_NUM if dev_set else STEP_NUM}')
118124

119-
Path(f'{output_dir}/{Path(model_name)}').mkdir(parents=True, exist_ok=True)
125+
Path(output_dir).mkdir(parents=True, exist_ok=True)
120126

121-
with open(f'{output_dir}/{model_name}.txt', 'w') as f:
127+
with open(f'{output_dir}/{model_name}_{_get_background_dir(with_background)}.txt', 'w') as f:
122128
f.write(f'correct problems: {correct_prob_num}/{DEV_PROB_NUM if dev_set else PROB_NUM - DEV_PROB_NUM}\n')
123129
f.write(f'correct steps: {len(correct_step)}/{DEV_STEP_NUM if dev_set else STEP_NUM}\n\n')
124130
f.write(f'duration: {test_time} seconds\n')
125131
f.write('\ncorrect problems: ')
126132
f.write(f'\n\n{[i + 1 for i in range(PROB_NUM) if correct_prob[i] == tot_prob[i] and tot_prob[i] != 0]}\n')
127133

128-
with open(f'{output_dir}/{model_name}.json', 'w', encoding='utf-8') as f:
134+
with open(f'{output_dir}/{model_name}_{_get_background_dir(with_background)}.json', 'w', encoding='utf-8') as f:
129135
json.dump(correct_dict, f, indent=4)
130136

131137
shutil.rmtree(tmp_dir)
@@ -166,6 +172,11 @@ def get_cli() -> argparse.ArgumentParser:
166172
"--dev-set",
167173
action='store_true',
168174
help="Test dev set if enabled",
175+
),
176+
parser.add_argument(
177+
"--with-background",
178+
action="store_true",
179+
help="Include problem background if enabled",
169180
)
170181
return parser
171182

@@ -175,9 +186,13 @@ def main(model: str,
175186
log_dir: Path,
176187
output_dir: Path,
177188
jsonl_path: Path,
178-
dev_set: bool
189+
dev_set: bool,
190+
with_background: bool
179191
) -> None:
180-
test_code(model, code_dir, log_dir, output_dir, jsonl_path, dev_set)
192+
if not Path(H5PY_FILE).exists():
193+
raise FileNotFoundError("Please download the numeric test results before testing generated code.")
194+
model = Path(model).parts[-1]
195+
test_code(model, code_dir, log_dir, output_dir, jsonl_path, dev_set, with_background)
181196

182197

183198
if __name__ == "__main__":

logs/litellm/together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo/11.1.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.

logs/litellm/together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo/11.10.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.

logs/litellm/together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo/11.11.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.

logs/litellm/together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo/11.12.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.

logs/litellm/together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo/11.2.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.

logs/litellm/together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo/11.3.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.

logs/litellm/together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo/11.4.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.

0 commit comments

Comments (0)