
Commit 4eeb558

resolved merge conflicts

2 parents 213e0f7 + 4b099e3

31 files changed: +1030 -225 lines

.gitignore

Lines changed: 5 additions & 1 deletion

```diff
@@ -1,3 +1,7 @@
+results/
+examples/lm_eval/prompts/system_message.txt
+examples/lm_eval/prompts/evaluator_system_message.txt
+
 # Python
 __pycache__/
 *.py[cod]
@@ -48,4 +52,4 @@ htmlcov/
 
 # For SR
 secrets.yaml
-problems
+problems
```

Makefile

Lines changed: 6 additions & 1 deletion

```diff
@@ -48,4 +48,9 @@ docker-build:
 # Run the Docker container with the example
 .PHONY: docker-run
 docker-run:
-	docker run --rm -v $(PROJECT_DIR):/app $(DOCKER_IMAGE) examples/function_minimization/initial_program.py examples/function_minimization/evaluator.py --config examples/function_minimization/config.yaml --iterations 1000
+	docker run --rm -v $(PROJECT_DIR):/app --network="host" $(DOCKER_IMAGE) examples/function_minimization/initial_program.py examples/function_minimization/evaluator.py --config examples/function_minimization/config.yaml --iterations 1000
+
+# Run the lm-eval benchmark
+.PHONY: lm-eval
+lm-eval:
+	$(PYTHON) scripts/lm_eval/lm-eval.py
```

README.md

Lines changed: 1 addition & 1 deletion

````diff
@@ -133,7 +133,7 @@ cat checkpoints/checkpoint_*/best_program_info.json | grep -A 10 metrics
 You can also install and execute via Docker:
 ```bash
 docker build -t openevolve .
-docker run --rm -v $(pwd):/app openevolve examples/function_minimization/initial_program.py examples/function_minimization/evaluator.py --config examples/function_minimization/config.yaml --iterations 1000
+docker run --rm -v $(pwd):/app --network="host" openevolve examples/function_minimization/initial_program.py examples/function_minimization/evaluator.py --config examples/function_minimization/config.yaml --iterations 1000
 ```
 
 ## Configuration
````

configs/default_config.yaml

Lines changed: 16 additions & 7 deletions

```diff
@@ -16,13 +16,21 @@ max_code_length: 10000  # Maximum allowed code length in character
 
 # LLM configuration
 llm:
-  # Primary model (used most frequently)
-  primary_model: "gemini-2.0-flash-lite"
-  primary_model_weight: 0.8  # Sampling weight for primary model
-
-  # Secondary model (used for occasional high-quality generations)
-  secondary_model: "gemini-2.0-flash"
-  secondary_model_weight: 0.2  # Sampling weight for secondary model
+  # Models for evolution
+  models:
+    # List of available models with their weights
+    - name: "gemini-2.0-flash-lite"
+      weight: 0.8
+    - name: "gemini-2.0-flash"
+      weight: 0.2
+
+  # Models for LLM feedback
+  evaluator_models:
+    # List of available models with their weights
+    - name: "gemini-2.0-flash-lite"
+      weight: 0.8
+    - name: "gemini-2.0-flash"
+      weight: 0.2
 
   # API configuration
   api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"  # Base URL for API (change for non-OpenAI models)
@@ -42,6 +50,7 @@ llm:
 prompt:
   template_dir: null  # Custom directory for prompt templates
   system_message: "You are an expert coder helping to improve programs through evolution."
+  evaluator_system_message: "You are an expert code reviewer."
 
   # Number of examples to include in the prompt
   num_top_programs: 3  # Number of top-performing programs to include
```
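The `models` and `evaluator_models` lists replace the former `primary_model`/`secondary_model` pair, so any number of models can be listed and sampled by weight. A minimal sketch of weight-proportional sampling over such a list, using a hypothetical `sample_model` helper (illustrative only, not OpenEvolve's actual code):

```python
import random

# The new config shape: each entry names a model and its sampling weight.
models = [
    {"name": "gemini-2.0-flash-lite", "weight": 0.8},
    {"name": "gemini-2.0-flash", "weight": 0.2},
]

def sample_model(model_list):
    """Pick one model, with probability proportional to its weight."""
    names = [m["name"] for m in model_list]
    weights = [m["weight"] for m in model_list]
    return random.choices(names, weights=weights, k=1)[0]

print(sample_model(models))  # "gemini-2.0-flash-lite" about 80% of the time
```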

examples/function_minimization/evaluator.py

Lines changed: 42 additions & 37 deletions

```diff
@@ -5,8 +5,9 @@
 import importlib.util
 import numpy as np
 import time
-import multiprocessing
+import concurrent.futures
 import traceback
+import signal
 
 
 def run_with_timeout(func, args=(), kwargs={}, timeout_seconds=5):
@@ -22,31 +23,13 @@ def run_with_timeout(func, args=(), kwargs={}, timeout_seconds=5):
     Returns:
         Result of the function or raises TimeoutError
     """
-
-    def wrapper(queue, func, args, kwargs):
+    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+        future = executor.submit(func, *args, **kwargs)
         try:
-            result = func(*args, **kwargs)
-            queue.put(("success", result))
-        except Exception as e:
-            queue.put(("error", e))
-
-    queue = multiprocessing.Queue()
-    process = multiprocessing.Process(target=wrapper, args=(queue, func, args, kwargs))
-    process.start()
-    process.join(timeout=timeout_seconds)
-
-    if process.is_alive():
-        process.terminate()
-        process.join()
-        raise TimeoutError(f"Function timed out after {timeout_seconds} seconds")
-
-    if queue.empty():
-        raise TimeoutError("Function ended without returning a result")
-
-    status, result = queue.get()
-    if status == "error":
-        raise result
-    return result
+            result = future.result(timeout=timeout_seconds)
+            return result
+        except concurrent.futures.TimeoutError:
+            raise TimeoutError(f"Function timed out after {timeout_seconds} seconds")
 
 
 def safe_float(value):
@@ -107,15 +90,27 @@ def evaluate(program_path):
             # Run with timeout
             result = run_with_timeout(program.run_search, timeout_seconds=5)
 
-            # Check if we got a tuple of 3 values
-            if not isinstance(result, tuple) or len(result) != 3:
+            # Handle different result formats
+            if isinstance(result, tuple):
+                if len(result) == 3:
+                    x, y, value = result
+                elif len(result) == 2:
+                    # Assume it's (x, y) and calculate value
+                    x, y = result
+                    # Calculate the function value since it wasn't returned
+                    value = np.sin(x) * np.cos(y) + np.sin(x * y) + (x**2 + y**2) / 20
+                    print(f"Trial {trial}: Got 2 values, calculated function value: {value}")
+                else:
+                    print(
+                        f"Trial {trial}: Invalid result format, expected tuple of 2 or 3 values but got {len(result)}"
+                    )
+                    continue
+            else:
                 print(
-                    f"Trial {trial}: Invalid result format, expected tuple of 3 values but got {type(result)}"
+                    f"Trial {trial}: Invalid result format, expected tuple but got {type(result)}"
                 )
                 continue
 
-            x, y, value = result
-
             end_time = time.time()
 
             # Ensure all values are float
@@ -264,15 +259,25 @@ def evaluate_stage1(program_path):
         # Run a single trial with timeout
        result = run_with_timeout(program.run_search, timeout_seconds=5)
 
-        # Check if we got a tuple of 3 values
-        if not isinstance(result, tuple) or len(result) != 3:
-            print(
-                f"Stage 1: Invalid result format, expected tuple of 3 values but got {type(result)}"
-            )
+        # Handle different result formats
+        if isinstance(result, tuple):
+            if len(result) == 3:
+                x, y, value = result
+            elif len(result) == 2:
+                # Assume it's (x, y) and calculate value
+                x, y = result
+                # Calculate the function value since it wasn't returned
+                value = np.sin(x) * np.cos(y) + np.sin(x * y) + (x**2 + y**2) / 20
+                print(f"Stage 1: Got 2 values, calculated function value: {value}")
+            else:
+                print(
+                    f"Stage 1: Invalid result format, expected tuple of 2 or 3 values but got {len(result)}"
+                )
+                return {"runs_successfully": 0.0, "error": "Invalid result format"}
+        else:
+            print(f"Stage 1: Invalid result format, expected tuple but got {type(result)}")
             return {"runs_successfully": 0.0, "error": "Invalid result format"}
 
-        x, y, value = result
-
         # Ensure all values are float
         x = safe_float(x)
         y = safe_float(y)
```
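The timeout helper now runs the candidate `run_search` in a worker thread instead of a separate process. Below is a minimal usage sketch of the thread-based variant with a hypothetical `slow_search` stand-in; note that, unlike the old `multiprocessing` version, a timed-out call is not killed, and the executor's context manager still waits for it to finish before the exception propagates:

```python
import concurrent.futures
import time

def run_with_timeout(func, args=(), kwargs={}, timeout_seconds=5):
    # Thread-based timeout, mirroring the patched evaluator helper.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(func, *args, **kwargs)
        try:
            return future.result(timeout=timeout_seconds)
        except concurrent.futures.TimeoutError:
            raise TimeoutError(f"Function timed out after {timeout_seconds} seconds")

def slow_search():
    time.sleep(2)  # stand-in for a run_search() that takes too long
    return 0.0, 0.0, 0.0

try:
    run_with_timeout(slow_search, timeout_seconds=0.5)
except TimeoutError as exc:
    print(exc)  # printed once slow_search() finishes, roughly 2 s later
```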
Lines changed: 1 addition & 0 deletions

```diff
@@ -0,0 +1 @@
+scipy
```

examples/lm_eval/README.md

Lines changed: 78 additions & 0 deletions

````diff
@@ -0,0 +1,78 @@
+# lm-eval.py
+
+`lm-eval.py` provides a basic benchmark capability for LLM feedback-based evolutionary task solving. The benchmark framework is [EleutherAI's lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness).
+
+*Limitation:* Only generation-only tasks such as gsm8k are supported. This is because tasks that require loglikelihood probabilities do not map well to agent-based solving.
+
+## Usage
+
+```bash
+$ python3 examples/lm_eval/lm-eval.py -h
+usage: lm-eval.py [-h] [--config CONFIG] [--init_file INIT_FILE] [--evaluator_file EVALUATOR_FILE] [--iterations ITERATIONS] [--limit LIMIT] [--tasks TASKS]
+                  [--output_path OUTPUT_PATH]
+
+OpenEvolve <-> lm-evaluation-harness adapter.
+
+options:
+  -h, --help            show this help message and exit
+  --config CONFIG       config file
+  --init_file INIT_FILE
+                        initial content file
+  --evaluator_file EVALUATOR_FILE
+                        evaluator file
+  --iterations ITERATIONS
+                        number of iterations
+  --limit LIMIT         limit the number of examples per task that are executed
+  --tasks TASKS         list of tasks to evaluate
+  --output_path OUTPUT_PATH
+                        output path for results
+```
+
+Early examples that **were meant to** show that more evolution iterations improve task performance -- I suspect the prompting may not be ideal yet:
+```
+$ python3 examples/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 1
+[..]
+Headline metrics:
+gsm8k exact_match,strict-match 80.000%
+[..]
+
+
+$ python3 examples/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 3
+[..]
+Headline metrics:
+gsm8k exact_match,strict-match 90.000%
+[..]
+
+$ python3 examples/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 10
+[..]
+Headline metrics:
+gsm8k exact_match,strict-match 80.000%
+[..]
+
+$ python3 examples/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 15
+[..]
+Headline metrics:
+gsm8k exact_match,strict-match 70.000%
+[..]
+```
+
+## Warning
+
+- Be aware that this is an early implementation. No extensive benchmarks have been executed so far. With a limit of 10 examples per task and around 10 iterations, the benchmark is meaningless as is.
+- Use the --limit parameter only for tests, not for metric generation.
+- Do not blindly cite the metrics produced by this script without reviewing the generated solutions first.
+
+## References
+
+```bibtex
+@misc{eval-harness,
+  author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},
+  title = {The Language Model Evaluation Harness},
+  month = 07,
+  year = 2024,
+  publisher = {Zenodo},
+  version = {v0.4.3},
+  doi = {10.5281/zenodo.12608602},
+  url = {https://zenodo.org/records/12608602}
+}
+```
````

examples/lm_eval/config.yml

Lines changed: 48 additions & 0 deletions

```diff
@@ -0,0 +1,48 @@
+max_iterations: 1
+checkpoint_interval: 10
+log_level: "INFO"
+
+# LLM configuration
+llm:
+  primary_model: "gemma3:12b-it-qat"
+  #primary_model: "gpt-4o"
+  primary_model_weight: 0.8
+  secondary_model: "gemma3:12b-it-qat"
+  #secondary_model: "gpt-4.1"
+  secondary_model_weight: 0.2
+  # api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
+  # api_base: "https://api.openai.com/v1/"
+  api_base: "http://localhost:11434/v1/"
+  api_key: "ollama"
+  temperature: 0.7
+  top_p: 0.95
+  max_tokens: 4096
+
+# Prompt configuration
+prompt:
+  num_top_programs: 3
+  use_template_stochasticity: true
+  # System prompt is created dynamically during the benchmark in file system_message.txt!
+  template_dir: "examples/lm_eval/prompts"
+
+# Database configuration
+database:
+  population_size: 50
+  archive_size: 20
+  num_islands: 3
+  elite_selection_ratio: 0.2
+  exploitation_ratio: 0.7
+
+# Evaluator configuration
+evaluator:
+  timeout: 60
+  cascade_evaluation: false
+  cascade_thresholds: [0.5, 0.75]
+  parallel_evaluations: 4
+  use_llm_feedback: true
+  llm_feedback_weight: 1.0
+
+
+# Evolution settings
+diff_based_evolution: false
+allow_full_rewrites: true
```
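With `use_llm_feedback: true`, review scores from the evaluator model are blended into the numeric metrics according to `llm_feedback_weight`. The sketch below is only a hypothetical illustration of what a weight of 1.0 could mean for a combined score; the weighting scheme is an assumption for clarity, not OpenEvolve's actual formula:

```python
# Hypothetical blend of numeric metrics with an LLM-assigned score; the
# weighting scheme here is assumed, not taken from OpenEvolve.
def combine_scores(metrics: dict, llm_score: float, llm_feedback_weight: float) -> float:
    base = sum(metrics.values()) / len(metrics) if metrics else 0.0
    return (base + llm_feedback_weight * llm_score) / (1.0 + llm_feedback_weight)

print(combine_scores({"runs_successfully": 1.0, "value_score": 0.6},
                     llm_score=0.9, llm_feedback_weight=1.0))  # 0.85
```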

examples/lm_eval/evaluator_stub.py

Lines changed: 6 additions & 0 deletions

```diff
@@ -0,0 +1,6 @@
+def evaluate_stage1(file_path):
+    return {"not_implemented": 0.0}
+
+
+def evaluate(file_path):
+    return evaluate_stage1(file_path)
```
Lines changed: 1 addition & 0 deletions

```diff
@@ -0,0 +1 @@
+insert the answer to the task here!
```
