Commit 1dc2983

Moved scripts/lm_eval/ to examples/lm_eval/

1 parent cc9b6db commit 1dc2983

13 files changed (+30, -20 lines)

.gitignore

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 results/
-scripts/lm_eval/prompts/system_message.txt
-scripts/lm_eval/prompts/evaluator_system_message.txt
+examples/lm_eval/prompts/system_message.txt
+examples/lm_eval/prompts/evaluator_system_message.txt
 
 # Python
 __pycache__/

scripts/lm_eval/README.md renamed to examples/lm_eval/README.md

Lines changed: 5 additions & 5 deletions
@@ -7,7 +7,7 @@
 ## Usage
 
 ```bash
-$ python3 scripts/lm_eval/lm-eval.py -h
+$ python3 examples/lm_eval/lm-eval.py -h
 usage: lm-eval.py [-h] [--config CONFIG] [--init_file INIT_FILE] [--evaluator_file EVALUATOR_FILE] [--iterations ITERATIONS] [--limit LIMIT] [--tasks TASKS]
                   [--output_path OUTPUT_PATH]
 
@@ -30,26 +30,26 @@ options:
 
 Early examples that **were meant to** indicate that more evolution iterations improve task performance -- I suspect the prompting may not be ideal yet:
 ```
-$ python3 scripts/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 1
+$ python3 examples/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 1
 [..]
 Headline metrics:
 gsm8k exact_match,strict-match 80.000%
 [..]
 
 
-$ python3 scripts/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 3
+$ python3 examples/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 3
 [..]
 Headline metrics:
 gsm8k exact_match,strict-match 90.000%
 [..]
 
-$ python3 scripts/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 10
+$ python3 examples/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 10
 [..]
 Headline metrics:
 gsm8k exact_match,strict-match 80.000%
 [..]
 
-$ python3 scripts/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 15
+$ python3 examples/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 15
 [..]
 Headline metrics:
 gsm8k exact_match,strict-match 70.000%

scripts/lm_eval/config.yml renamed to examples/lm_eval/config.yml

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ prompt:
   num_top_programs: 3
   use_template_stochasticity: true
   # System prompt is created dynamically during the benchmark in file system_message.txt!
-  template_dir: "scripts/lm_eval/prompts"
+  template_dir: "examples/lm_eval/prompts"
 
 # Database configuration
 database:
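
The `template_dir` setting has to agree with the paths the adapter writes to: as the config comment notes, the system prompt is regenerated in `system_message.txt` inside that folder during the benchmark. Below is a minimal sketch of that hand-off, assuming the paths used in this commit; the arguments the real adapter appends to the pipeline command are not shown here and are deliberately omitted.

```python
# Sketch only: illustrates why template_dir in config.yml must match the
# adapter's prompt paths. Paths are taken from this commit; the pipeline
# invocation below omits CLI arguments, since they are not shown here.
import pathlib
import subprocess

PIPELINE_CMD = ["python3", "openevolve-run.py"]
PROMPT_PATH = pathlib.Path("examples/lm_eval/prompts/system_message.txt")


def run_task(task_prompt: str) -> None:
    # Write the per-task system message that the prompt templates pick up
    # via template_dir: "examples/lm_eval/prompts".
    PROMPT_PATH.parent.mkdir(parents=True, exist_ok=True)
    PROMPT_PATH.write_text(task_prompt)
    # Launch the OpenEvolve pipeline; the real adapter appends its own
    # arguments (init file, evaluator, config) to this command.
    subprocess.run(PIPELINE_CMD, check=True)
```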
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
def evaluate_stage1(file_path):
22
return {"not_implemented": 0.0}
33

4+
45
def evaluate(file_path):
56
return evaluate_stage1(file_path)
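
This stub is what `--evaluator_file` points at; it returns a placeholder metric because the actual scoring happens on the lm-evaluation-harness side of the adapter. A quick way to sanity-check it in isolation (paths are assumptions taken from this commit):

```python
# Load and call the evaluator stub directly; a smoke test, not part of the
# benchmark itself. The program path below is the best-program location used
# elsewhere in this commit and is only illustrative.
import importlib.util

spec = importlib.util.spec_from_file_location(
    "evaluator_stub", "examples/lm_eval/evaluator_stub.py"
)
stub = importlib.util.module_from_spec(spec)
spec.loader.exec_module(stub)

print(stub.evaluate("examples/lm_eval/openevolve_output/best/best_program.txt"))
# Expected output: {'not_implemented': 0.0}
```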

scripts/lm_eval/lm-eval.py renamed to examples/lm_eval/lm-eval.py

Lines changed: 21 additions & 12 deletions
@@ -23,6 +23,7 @@
 
 PIPELINE_CMD = ["python3", "openevolve-run.py"]
 
+
 @register_model("openevolve")
 class OpenEvolve(LM):
     def __init__(
@@ -42,9 +43,9 @@ def __init__(
         self.config_file = config_file
 
         # folder must match prompt:template_dir in config.yml!
-        self.prompt_path = "scripts/lm_eval/prompts/system_message.txt"
-        self.evaluator_prompt_path = "scripts/lm_eval/prompts/evaluator_system_message.txt"
-        self.best_path = "scripts/lm_eval/openevolve_output/best/best_program.txt"
+        self.prompt_path = "examples/lm_eval/prompts/system_message.txt"
+        self.evaluator_prompt_path = "examples/lm_eval/prompts/evaluator_system_message.txt"
+        self.best_path = "examples/lm_eval/openevolve_output/best/best_program.txt"
         self.base_system_message = "You are an expert task solver, with a lot of commonsense, math, language and coding knowledge.\n\nConsider this task:\n```{prompt}´´´"
 
     def generate(self, prompts: List[str], max_gen_toks: int = None, stop=None, **kwargs):
@@ -133,22 +134,28 @@ def generate_until(self, requests: Iterable[Any], **kw) -> List[str]:
             cleaned.append(g)
         return cleaned
 
+
 if __name__ == "__main__":
     # cli arguments for primary model, secondary model, iterations, config and tasks
     p = argparse.ArgumentParser(
         description="OpenEvolve <-> lm-evaluation-harness adapter.",
     )
-    p.add_argument("--config", default="scripts/lm_eval/config.yml", help="config file")
+    p.add_argument("--config", default="examples/lm_eval/config.yml", help="config file")
     p.add_argument(
         "--init_file",
-        default="scripts/lm_eval/initial_content_stub.txt",
+        default="examples/lm_eval/initial_content_stub.txt",
         help="initial content file",
     )
     p.add_argument(
-        "--evaluator_file", default="scripts/lm_eval/evaluator_stub.py", help="evaluator file"
+        "--evaluator_file", default="examples/lm_eval/evaluator_stub.py", help="evaluator file"
     )
     p.add_argument("--iterations", default=5, type=int, help="number of iterations")
-    p.add_argument("--limit", default=None, type=int, help="limit the number of examples per task that are executed")
+    p.add_argument(
+        "--limit",
+        default=None,
+        type=int,
+        help="limit the number of examples per task that are executed",
+    )
     # p.add_argument("--tasks", default="boolq,gsm8k,mmlu", help="comma-list of tasks to evaluate")
     p.add_argument("--tasks", default="gsm8k", help="list of tasks to evaluate")
     p.add_argument("--output_path", default="results", help="output path for results")
@@ -175,10 +182,12 @@ def generate_until(self, requests: Iterable[Any], **kw) -> List[str]:
     ).mkdir(exist_ok=True)
 
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    results_path = pathlib.Path(os.path.join(
-        args.output_path,
-        f"{timestamp}_iter{args.iterations}.json",
-    ))
+    results_path = pathlib.Path(
+        os.path.join(
+            args.output_path,
+            f"{timestamp}_iter{args.iterations}.json",
+        )
+    )
 
     with results_path.open("w") as f:
         json.dump(results, f, indent=2)
@@ -189,7 +198,7 @@ def generate_until(self, requests: Iterable[Any], **kw) -> List[str]:
         # pick the first value that is a real number
         for key, val in metrics.items():
             if isinstance(val, (int, float)):
-                short[task] = (key, val) # store *both* name & value
+                short[task] = (key, val)  # store *both* name & value
                 break
 
     print(f"Full results written to {results_path}\n")
File renamed without changes.
File renamed without changes.
