Commit 1dc2983

Moved scripts/lm_eval/ to examples/lm_eval/

1 parent cc9b6db commit 1dc2983

13 files changed (+30, -20 lines)

.gitignore

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 results/
-scripts/lm_eval/prompts/system_message.txt
-scripts/lm_eval/prompts/evaluator_system_message.txt
+examples/lm_eval/prompts/system_message.txt
+examples/lm_eval/prompts/evaluator_system_message.txt
 
 # Python
 __pycache__/

scripts/lm_eval/README.md renamed to examples/lm_eval/README.md

Lines changed: 5 additions & 5 deletions
@@ -7,7 +7,7 @@
 ## Usage
 
 ```bash
-$ python3 scripts/lm_eval/lm-eval.py -h
+$ python3 examples/lm_eval/lm-eval.py -h
 usage: lm-eval.py [-h] [--config CONFIG] [--init_file INIT_FILE] [--evaluator_file EVALUATOR_FILE] [--iterations ITERATIONS] [--limit LIMIT] [--tasks TASKS]
                   [--output_path OUTPUT_PATH]
 
@@ -30,26 +30,26 @@ options:
 
 Early examples that **were meant to** indicate that more evolution iterations improve task performance -- I suspect the prompting may not be ideal yet:
 ```
-$ python3 scripts/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 1
+$ python3 examples/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 1
 [..]
 Headline metrics:
 gsm8k exact_match,strict-match 80.000%
 [..]
 
 
-$ python3 scripts/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 3
+$ python3 examples/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 3
 [..]
 Headline metrics:
 gsm8k exact_match,strict-match 90.000%
 [..]
 
-$ python3 scripts/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 10
+$ python3 examples/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 10
 [..]
 Headline metrics:
 gsm8k exact_match,strict-match 80.000%
 [..]
 
-$ python3 scripts/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 15
+$ python3 examples/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 15
 [..]
 Headline metrics:
 gsm8k exact_match,strict-match 70.000%

scripts/lm_eval/config.yml renamed to examples/lm_eval/config.yml

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ prompt:
   num_top_programs: 3
   use_template_stochasticity: true
   # System prompt is created dynamically during the benchmark in file system_message.txt!
-  template_dir: "scripts/lm_eval/prompts"
+  template_dir: "examples/lm_eval/prompts"
 
 # Database configuration
 database:
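
The `template_dir` setting has to agree with the paths the adapter writes to: as the config comment notes, the system prompt is regenerated in `system_message.txt` inside that folder during the benchmark. Below is a minimal sketch of that hand-off, assuming the paths used in this commit; the arguments the real adapter appends to the pipeline command are not shown here and are deliberately omitted.

```python
# Sketch only: illustrates why template_dir in config.yml must match the
# adapter's prompt paths. Paths are taken from this commit; the pipeline
# invocation below omits CLI arguments, since they are not shown here.
import pathlib
import subprocess

PIPELINE_CMD = ["python3", "openevolve-run.py"]
PROMPT_PATH = pathlib.Path("examples/lm_eval/prompts/system_message.txt")


def run_task(task_prompt: str) -> None:
    # Write the per-task system message that the prompt templates pick up
    # via template_dir: "examples/lm_eval/prompts".
    PROMPT_PATH.parent.mkdir(parents=True, exist_ok=True)
    PROMPT_PATH.write_text(task_prompt)
    # Launch the OpenEvolve pipeline; the real adapter appends its own
    # arguments (init file, evaluator, config) to this command.
    subprocess.run(PIPELINE_CMD, check=True)
```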
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
def evaluate_stage1(file_path):
22
return {"not_implemented": 0.0}
33

4+
45
def evaluate(file_path):
56
return evaluate_stage1(file_path)
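
This stub is what `--evaluator_file` points at; it returns a placeholder metric because the actual scoring happens on the lm-evaluation-harness side of the adapter. A quick way to sanity-check it in isolation (paths are assumptions taken from this commit):

```python
# Load and call the evaluator stub directly; a smoke test, not part of the
# benchmark itself. The program path below is the best-program location used
# elsewhere in this commit and is only illustrative.
import importlib.util

spec = importlib.util.spec_from_file_location(
    "evaluator_stub", "examples/lm_eval/evaluator_stub.py"
)
stub = importlib.util.module_from_spec(spec)
spec.loader.exec_module(stub)

print(stub.evaluate("examples/lm_eval/openevolve_output/best/best_program.txt"))
# Expected output: {'not_implemented': 0.0}
```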

scripts/lm_eval/lm-eval.py renamed to examples/lm_eval/lm-eval.py

Lines changed: 21 additions & 12 deletions
@@ -23,6 +23,7 @@
 
 PIPELINE_CMD = ["python3", "openevolve-run.py"]
 
+
 @register_model("openevolve")
 class OpenEvolve(LM):
     def __init__(
@@ -42,9 +43,9 @@ def __init__(
         self.config_file = config_file
 
         # folder must match prompt:template_dir in config.yml!
-        self.prompt_path = "scripts/lm_eval/prompts/system_message.txt"
-        self.evaluator_prompt_path = "scripts/lm_eval/prompts/evaluator_system_message.txt"
-        self.best_path = "scripts/lm_eval/openevolve_output/best/best_program.txt"
+        self.prompt_path = "examples/lm_eval/prompts/system_message.txt"
+        self.evaluator_prompt_path = "examples/lm_eval/prompts/evaluator_system_message.txt"
+        self.best_path = "examples/lm_eval/openevolve_output/best/best_program.txt"
         self.base_system_message = "You are an expert task solver, with a lot of commonsense, math, language and coding knowledge.\n\nConsider this task:\n```{prompt}´´´"
 
     def generate(self, prompts: List[str], max_gen_toks: int = None, stop=None, **kwargs):
@@ -133,22 +134,28 @@ def generate_until(self, requests: Iterable[Any], **kw) -> List[str]:
             cleaned.append(g)
         return cleaned
 
+
 if __name__ == "__main__":
     # cli arguments for primary model, secondary model, iterations, config and tasks
     p = argparse.ArgumentParser(
         description="OpenEvolve <-> lm-evaluation-harness adapter.",
     )
-    p.add_argument("--config", default="scripts/lm_eval/config.yml", help="config file")
+    p.add_argument("--config", default="examples/lm_eval/config.yml", help="config file")
     p.add_argument(
         "--init_file",
-        default="scripts/lm_eval/initial_content_stub.txt",
+        default="examples/lm_eval/initial_content_stub.txt",
         help="initial content file",
     )
     p.add_argument(
-        "--evaluator_file", default="scripts/lm_eval/evaluator_stub.py", help="evaluator file"
+        "--evaluator_file", default="examples/lm_eval/evaluator_stub.py", help="evaluator file"
     )
     p.add_argument("--iterations", default=5, type=int, help="number of iterations")
-    p.add_argument("--limit", default=None, type=int, help="limit the number of examples per task that are executed")
+    p.add_argument(
+        "--limit",
+        default=None,
+        type=int,
+        help="limit the number of examples per task that are executed",
+    )
     # p.add_argument("--tasks", default="boolq,gsm8k,mmlu", help="comma-list of tasks to evaluate")
     p.add_argument("--tasks", default="gsm8k", help="list of tasks to evaluate")
     p.add_argument("--output_path", default="results", help="output path for results")
@@ -175,10 +182,12 @@ def generate_until(self, requests: Iterable[Any], **kw) -> List[str]:
     ).mkdir(exist_ok=True)
 
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    results_path = pathlib.Path(os.path.join(
-        args.output_path,
-        f"{timestamp}_iter{args.iterations}.json",
-    ))
+    results_path = pathlib.Path(
+        os.path.join(
+            args.output_path,
+            f"{timestamp}_iter{args.iterations}.json",
+        )
+    )
 
     with results_path.open("w") as f:
         json.dump(results, f, indent=2)
@@ -189,7 +198,7 @@ def generate_until(self, requests: Iterable[Any], **kw) -> List[str]:
         # pick the first value that is a real number
         for key, val in metrics.items():
             if isinstance(val, (int, float)):
-                short[task] = (key, val) # store *both* name & value
+                short[task] = (key, val)  # store *both* name & value
                 break
 
     print(f"Full results written to {results_path}\n")
File renamed without changes.
File renamed without changes.
