
Commit 7d9934b

Merge branch 'main' into feat/MLX-kernel-optimization
2 parents: fd76977 + 4b099e3

25 files changed: +748 -129 lines

.gitignore

Lines changed: 5 additions & 1 deletion

@@ -1,3 +1,7 @@
+results/
+examples/lm_eval/prompts/system_message.txt
+examples/lm_eval/prompts/evaluator_system_message.txt
+
 # Python
 __pycache__/
 *.py[cod]
@@ -48,4 +52,4 @@ htmlcov/
 
 # For SR
 secrets.yaml
-problems
+problems

Makefile

Lines changed: 6 additions & 1 deletion

@@ -48,4 +48,9 @@ docker-build:
 # Run the Docker container with the example
 .PHONY: docker-run
 docker-run:
-	docker run --rm -v $(PROJECT_DIR):/app $(DOCKER_IMAGE) examples/function_minimization/initial_program.py examples/function_minimization/evaluator.py --config examples/function_minimization/config.yaml --iterations 1000
+	docker run --rm -v $(PROJECT_DIR):/app --network="host" $(DOCKER_IMAGE) examples/function_minimization/initial_program.py examples/function_minimization/evaluator.py --config examples/function_minimization/config.yaml --iterations 1000
+
+# Run the lm-eval benchmark
+.PHONY: lm-eval
+lm-eval:
+	$(PYTHON) scripts/lm_eval/lm-eval.py
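Beyond adding `--network="host"` to `docker-run` (presumably so that `localhost` inside the container reaches services on the host, such as the Ollama endpoint used in `examples/lm_eval/config.yml`), this hunk adds an `lm-eval` convenience target. A usage sketch, assuming `PYTHON` is defined earlier in the Makefile as the existing targets imply:

```bash
# Build the image and run the function-minimization example with host networking
make docker-build
make docker-run

# Run the new lm-eval benchmark target
make lm-eval
```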

README.md

Lines changed: 1 addition & 1 deletion

@@ -133,7 +133,7 @@ cat checkpoints/checkpoint_*/best_program_info.json | grep -A 10 metrics
 You can also install and execute via Docker:
 ```bash
 docker build -t openevolve .
-docker run --rm -v $(pwd):/app openevolve examples/function_minimization/initial_program.py examples/function_minimization/evaluator.py --config examples/function_minimization/config.yaml --iterations 1000
+docker run --rm -v $(pwd):/app --network="host" openevolve examples/function_minimization/initial_program.py examples/function_minimization/evaluator.py --config examples/function_minimization/config.yaml --iterations 1000
 ```
 
 ## Configuration

configs/default_config.yaml

Lines changed: 16 additions & 7 deletions

@@ -16,13 +16,21 @@ max_code_length: 10000 # Maximum allowed code length in character
 
 # LLM configuration
 llm:
-  # Primary model (used most frequently)
-  primary_model: "gemini-2.0-flash-lite"
-  primary_model_weight: 0.8  # Sampling weight for primary model
-
-  # Secondary model (used for occasional high-quality generations)
-  secondary_model: "gemini-2.0-flash"
-  secondary_model_weight: 0.2  # Sampling weight for secondary model
+  # Models for evolution
+  models:
+    # List of available models with their weights
+    - name: "gemini-2.0-flash-lite"
+      weight: 0.8
+    - name: "gemini-2.0-flash"
+      weight: 0.2
+
+  # Models for LLM feedback
+  evaluator_models:
+    # List of available models with their weights
+    - name: "gemini-2.0-flash-lite"
+      weight: 0.8
+    - name: "gemini-2.0-flash"
+      weight: 0.2
 
   # API configuration
   api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"  # Base URL for API (change for non-OpenAI models)

@@ -42,6 +50,7 @@ llm:
 prompt:
   template_dir: null  # Custom directory for prompt templates
   system_message: "You are an expert coder helping to improve programs through evolution."
+  evaluator_system_message: "You are an expert code reviewer."
 
   # Number of examples to include in the prompt
   num_top_programs: 3  # Number of top-performing programs to include
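The headline change: instead of a fixed primary/secondary pair, both evolution and LLM feedback now take a list of models with sampling weights. As a minimal sketch of what weight-proportional selection over such a list looks like (illustrative only; OpenEvolve's own selection code may differ):

```python
import random

# Illustrative only: weight-proportional choice over the new `llm.models` list.
# The dicts below mirror the YAML above; OpenEvolve's internal logic may differ.
models = [
    {"name": "gemini-2.0-flash-lite", "weight": 0.8},
    {"name": "gemini-2.0-flash", "weight": 0.2},
]

def pick_model(models):
    """Pick a model name with probability proportional to its weight."""
    return random.choices(
        [m["name"] for m in models],
        weights=[m["weight"] for m in models],
        k=1,
    )[0]

print(pick_model(models))  # ~80% "gemini-2.0-flash-lite", ~20% "gemini-2.0-flash"
```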

examples/lm_eval/README.md

Lines changed: 78 additions & 0 deletions (new file)

# lm-eval.py

`lm-eval.py` provides basic benchmark capability for LLM feedback-based evolutionary task solving. The benchmark framework is [EleutherAI's lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness).

*Limitation:* Only generation-only tasks such as gsm8k are supported, because tasks that require loglikelihood probabilities do not map well onto agent-based solving.

## Usage

```bash
$ python3 examples/lm_eval/lm-eval.py -h
usage: lm-eval.py [-h] [--config CONFIG] [--init_file INIT_FILE] [--evaluator_file EVALUATOR_FILE] [--iterations ITERATIONS] [--limit LIMIT] [--tasks TASKS]
                  [--output_path OUTPUT_PATH]

OpenEvolve <-> lm-evaluation-harness adapter.

options:
  -h, --help            show this help message and exit
  --config CONFIG       config file
  --init_file INIT_FILE
                        initial content file
  --evaluator_file EVALUATOR_FILE
                        evaluator file
  --iterations ITERATIONS
                        number of iterations
  --limit LIMIT         limit the number of examples per task that are executed
  --tasks TASKS         list of tasks to evaluate
  --output_path OUTPUT_PATH
                        output path for results
```

Early runs that **were meant to** show that more evolution iterations improve task performance -- I suspect the prompting is not ideal yet:

```
$ python3 examples/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 1
[..]
Headline metrics:
  gsm8k  exact_match,strict-match  80.000%
[..]

$ python3 examples/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 3
[..]
Headline metrics:
  gsm8k  exact_match,strict-match  90.000%
[..]

$ python3 examples/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 10
[..]
Headline metrics:
  gsm8k  exact_match,strict-match  80.000%
[..]

$ python3 examples/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 15
[..]
Headline metrics:
  gsm8k  exact_match,strict-match  70.000%
[..]
```

## Warning

- Be aware that this is an early implementation; no extensive benchmarks have been run so far. With a limit of 10 examples per task and 10 iterations, the benchmark is meaningless as is.
- Use the `--limit` parameter only for tests, not for metric generation.
- Do not cite the metrics this script produces without reviewing the generated solutions first.

## References

```bibtex
@misc{eval-harness,
  author    = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},
  title     = {The Language Model Evaluation Harness},
  month     = 07,
  year      = 2024,
  publisher = {Zenodo},
  version   = {v0.4.3},
  doi       = {10.5281/zenodo.12608602},
  url       = {https://zenodo.org/records/12608602}
}
```
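For reference, a plausible end-to-end invocation that combines the flags documented above with the files added in this commit (flag spellings are taken from the `-h` output; defaults for omitted flags are whatever `lm-eval.py` ships with):

```bash
# Run gsm8k through the adapter using the Ollama-backed example config.
# --limit keeps the run small for testing; do not use it for reportable metrics.
python3 examples/lm_eval/lm-eval.py \
  --config examples/lm_eval/config.yml \
  --evaluator_file examples/lm_eval/evaluator_stub.py \
  --tasks gsm8k --limit 10 --iterations 3 \
  --output_path results/
```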

examples/lm_eval/config.yml

Lines changed: 48 additions & 0 deletions (new file)

max_iterations: 1
checkpoint_interval: 10
log_level: "INFO"

# LLM configuration
llm:
  primary_model: "gemma3:12b-it-qat"
  #primary_model: "gpt-4o"
  primary_model_weight: 0.8
  secondary_model: "gemma3:12b-it-qat"
  #secondary_model: "gpt-4.1"
  secondary_model_weight: 0.2
  # api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
  # api_base: "https://api.openai.com/v1/"
  api_base: "http://localhost:11434/v1/"
  api_key: "ollama"
  temperature: 0.7
  top_p: 0.95
  max_tokens: 4096

# Prompt configuration
prompt:
  num_top_programs: 3
  use_template_stochasticity: true
  # System prompt is created dynamically during the benchmark in file system_message.txt!
  template_dir: "examples/lm_eval/prompts"

# Database configuration
database:
  population_size: 50
  archive_size: 20
  num_islands: 3
  elite_selection_ratio: 0.2
  exploitation_ratio: 0.7

# Evaluator configuration
evaluator:
  timeout: 60
  cascade_evaluation: false
  cascade_thresholds: [0.5, 0.75]
  parallel_evaluations: 4
  use_llm_feedback: true
  llm_feedback_weight: 1.0


# Evolution settings
diff_based_evolution: false
allow_full_rewrites: true
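A note on the evaluator block: with `use_llm_feedback: true` and the stub evaluator below returning only a placeholder metric, the useful score comes from LLM feedback. As an illustration of how a feedback weight of this kind can fold LLM-graded metrics into programmatic ones (a sketch of the general idea only, not necessarily OpenEvolve's exact formula):

```python
# Illustrative only: a weighted average of programmatic metrics and LLM-feedback
# metrics. OpenEvolve's actual combination logic may differ from this sketch.
def combine_scores(program_metrics, llm_metrics, llm_feedback_weight=1.0):
    program_avg = sum(program_metrics.values()) / len(program_metrics)
    llm_avg = sum(llm_metrics.values()) / len(llm_metrics)
    return (program_avg + llm_feedback_weight * llm_avg) / (1.0 + llm_feedback_weight)

# With the stub's placeholder metric and a hypothetical LLM grade of 0.9:
print(combine_scores({"not_implemented": 0.0}, {"llm_correctness": 0.9}))  # 0.45
```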

examples/lm_eval/evaluator_stub.py

Lines changed: 6 additions & 0 deletions (new file)

def evaluate_stage1(file_path):
    return {"not_implemented": 0.0}


def evaluate(file_path):
    return evaluate_stage1(file_path)
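The stub only fixes the interface: an evaluator exposes `evaluate(file_path)` (and `evaluate_stage1` for cascade evaluation) and returns a dict mapping metric names to float scores; here the stub leaves actual grading to the LLM feedback configured in `config.yml`. For contrast, a hypothetical non-stub evaluator with the same interface might look like the following (not part of this commit; the metric name and check are invented for illustration):

```python
# Hypothetical evaluator following the same interface as evaluator_stub.py.
# Not part of this commit; the "answered" metric is made up for illustration.

def evaluate_stage1(file_path):
    """Cheap first-stage check: does the file contain a non-placeholder answer?"""
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read().strip()
    except OSError:
        return {"answered": 0.0}
    has_answer = bool(content) and "insert the answer" not in content.lower()
    return {"answered": 1.0 if has_answer else 0.0}


def evaluate(file_path):
    """Entry point: return a dict of metric name -> score."""
    return evaluate_stage1(file_path)
```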
Lines changed: 1 addition & 0 deletions (new file)

insert the answer to the task here!
