Commit f84be60
Better support for LLM feedback and handling of LLM ensembles.
- config.py supports configuration of n-model ensembles for evolution and, optionally, a separate ensemble for evaluation; backwards-compatible YAML format; settings can be made for all models in `llm:` or for a specific model in `llm:models`; new `evaluator_system_message` setting
- ensemble.py supports n-model ensembles
- OpenAILLM supports individual parameter configuration per model
- ensemble.py has a new generate_all_with_context() function
- evaluator.py uses the prompt sampler to generate LLM feedback prompts
- templates.py contains default prompts for LLM feedback

With the function_minimization example, set `use_llm_feedback: true` in its config.yaml. The LLM feedback will provide output such as:

```json
{
  "readability": 0.92,
  "maintainability": 0.88,
  "efficiency": 0.82,
  "reasoning": "The code is quite readable, with clear function and variable names, concise comments, and a docstring explaining the purpose and arguments of the main search function. There is some minor room for improvement, such as splitting up large inner loops or extracting repeated logic, but overall it is easy to follow. Maintainability is high due to modularization and descriptive naming, but could be slightly improved by reducing the nesting level and possibly moving the annealing routine to its own top-level function. Efficiency is good for a simple global optimization approach; vectorized numpy operations are used where appropriate, and the population-based simulated annealing is a reasonable trade-off between exploration and exploitation. However, the algorithm could be further optimized (e.g., by fully vectorizing more of the walker updates or parallelizing restarts), and the approach is not the most efficient for high-dimensional or more complex landscapes."
}
```

The evolution can then consider the additional values:

```
Evolution complete!
Best program metrics:
  runs_successfully: 1.0000
  value_score: 0.9997
  distance_score: 0.9991
  overall_score: 0.9905
  standard_deviation_score: 0.9992
  speed_score: 0.0610
  reliability_score: 1.0000
  combined_score: 0.9525
  success_rate: 1.0000
  llm_readability: 0.0904
  llm_maintainability: 0.0816
  llm_efficiency: 0.0764
```

Note: I did not evaluate the results yet.
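For illustration, a minimal sketch (not part of the commit) of how the backwards-compatible options resolve, assuming the `Config.from_dict` and `LLMConfig.__post_init__` logic shown in the config.py diff below; the dictionary stands in for a parsed old-style YAML file:

```python
# Sketch (not from the commit): an old-style llm section is folded into the new
# `models` list by LLMConfig.__post_init__, and the evaluator ensemble falls back
# to the same models when none are configured.
from openevolve.config import Config

old_style = {
    "llm": {
        "primary_model": "gemini-2.0-flash-lite",
        "primary_model_weight": 0.8,
        "secondary_model": "gemini-2.0-flash",
        "secondary_model_weight": 0.2,
    }
}

config = Config.from_dict(old_style)
print([(m.name, m.weight) for m in config.llm.models])
# expected: [('gemini-2.0-flash-lite', 0.8), ('gemini-2.0-flash', 0.2)]
print(len(config.llm.evaluator_models))  # 2 (reuses the evolution models)
```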
1 parent aa27227 commit f84be60

File tree: 9 files changed (+295, -124 lines)

configs/default_config.yaml

Lines changed: 16 additions & 7 deletions
@@ -16,13 +16,21 @@ max_code_length: 10000 # Maximum allowed code length in character
 
 # LLM configuration
 llm:
-  # Primary model (used most frequently)
-  primary_model: "gemini-2.0-flash-lite"
-  primary_model_weight: 0.8 # Sampling weight for primary model
-
-  # Secondary model (used for occasional high-quality generations)
-  secondary_model: "gemini-2.0-flash"
-  secondary_model_weight: 0.2 # Sampling weight for secondary model
+  # Models for evolution
+  models:
+    # List of available models with their weights
+    - name: "gemini-2.0-flash-lite"
+      weight: 0.8
+    - name: "gemini-2.0-flash"
+      weight: 0.2
+
+  # Models for LLM feedback
+  evaluator_models:
+    # List of available models with their weights
+    - name: "gemini-2.0-flash-lite"
+      weight: 0.8
+    - name: "gemini-2.0-flash"
+      weight: 0.2
 
   # API configuration
   api_base: "https://generativelanguage.googleapis.com/v1beta/openai/" # Base URL for API (change for non-OpenAI models)

@@ -42,6 +50,7 @@ llm:
 prompt:
   template_dir: null # Custom directory for prompt templates
   system_message: "You are an expert coder helping to improve programs through evolution."
+  evaluator_system_message: "You are an expert code reviewer."
 
   # Number of examples to include in the prompt
   num_top_programs: 3 # Number of top-performing programs to include
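Per-model settings take precedence over the shared `llm:` level. A small sketch (not part of the commit) of that behaviour, assuming `update_model_params` is applied with `overwrite=False` as in the config.py diff below:

```python
# Sketch (not from the commit): shared llm-level settings only fill in fields a
# model has not set itself, so per-model values win.
from openevolve.config import Config

config = Config.from_dict({
    "llm": {
        "temperature": 0.9,  # shared default for every model
        "models": [
            {"name": "gemini-2.0-flash-lite", "weight": 0.8, "temperature": 0.2},
            {"name": "gemini-2.0-flash", "weight": 0.2},
        ],
    }
})

print(config.llm.models[0].temperature)  # 0.2 - per-model override kept
print(config.llm.models[1].temperature)  # 0.9 - filled from the llm: level
```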

openevolve/config.py

Lines changed: 103 additions & 23 deletions
@@ -11,22 +11,41 @@
 
 
 @dataclass
-class LLMConfig:
-    """Configuration for LLM models"""
+class LLMModelConfig:
+    """Configuration for a single LLM model"""
 
-    # Primary model
-    primary_model: str = "gemini-2.0-flash-lite"
-    primary_model_weight: float = 0.8
+    # API configuration
+    api_base: str = None
+    api_key: Optional[str] = None
+    name: str = None
 
-    # Secondary model
-    secondary_model: str = "gemini-2.0-flash"
-    secondary_model_weight: float = 0.2
+    # Weight for model in ensemble
+    weight: float = 1.0
+
+    # Generation parameters
+    system_message: Optional[str] = None
+    temperature: float = None
+    top_p: float = None
+    max_tokens: int = None
+
+    # Request parameters
+    timeout: int = None
+    retries: int = None
+    retry_delay: int = None
+
+
+@dataclass
+class LLMConfig(LLMModelConfig):
+    """Configuration for LLM models"""
 
     # API configuration
     api_base: str = "https://api.openai.com/v1"
-    api_key: Optional[str] = None
+    name: str = "gpt-4o"
 
     # Generation parameters
+    system_message: Optional[str] = (
+        "You are an expert coder helping to improve programs through evolution."
+    )
     temperature: float = 0.7
     top_p: float = 0.95
     max_tokens: int = 4096

@@ -36,13 +55,69 @@ class LLMConfig:
     retries: int = 3
     retry_delay: int = 5
 
+    # n-model configuration for evolution LLM ensemble
+    models: List[LLMModelConfig] = field(default_factory=lambda: [LLMModelConfig()])
+
+    # n-model configuration for evaluator LLM ensemble
+    evaluator_models: List[LLMModelConfig] = field(default_factory=lambda: [])
+
+    # Backwardes compatibility with primary_model(_weight) options
+    primary_model: str = "gemini-2.0-flash-lite"
+    primary_model_weight: float = 0.8
+    secondary_model: str = "gemini-2.0-flash"
+    secondary_model_weight: float = 0.2
+
+    def __post_init__(self):
+        """Post-initialization to set up model configurations"""
+        # Handle backward compatibility for primary_model(_weight) and secondary_model(_weight).
+        if (self.primary_model or self.primary_model_weight) and len(self.models) < 1:
+            # Ensure we have a primary model
+            self.models.append(LLMModelConfig())
+        if self.primary_model:
+            self.models[0].name = self.primary_model
+        if self.primary_model_weight:
+            self.models[0].weight = self.primary_model_weight
+
+        if (self.secondary_model or self.secondary_model_weight) and len(self.models) < 2:
+            # Ensure we have a second model
+            self.models.append(LLMModelConfig())
+        if self.secondary_model:
+            self.models[1].name = self.secondary_model
+        if self.secondary_model_weight:
+            self.models[1].weight = self.secondary_model_weight
+
+        # If no evaluator models are defined, use the same models as for evolution
+        if not self.evaluator_models or len(self.evaluator_models) < 1:
+            self.evaluator_models = self.models.copy()
+
+        # Update models with shared configuration values
+        shared_config = {
+            "api_base": self.api_base,
+            "api_key": self.api_key,
+            "temperature": self.temperature,
+            "top_p": self.top_p,
+            "max_tokens": self.max_tokens,
+            "timeout": self.timeout,
+            "retries": self.retries,
+            "retry_delay": self.retry_delay,
+        }
+        self.update_model_params(shared_config)
+
+    def update_model_params(self, args: Dict[str, Any], overwrite: bool = False) -> None:
+        """Update model parameters for all models"""
+        for model in self.models + self.evaluator_models:
+            for key, value in args.items():
+                if overwrite or getattr(model, key, None) is None:
+                    setattr(model, key, value)
+
 
 @dataclass
 class PromptConfig:
     """Configuration for prompt generation"""
 
     template_dir: Optional[str] = None
     system_message: str = "You are an expert coder helping to improve programs through evolution."
+    evaluator_system_message: str = """You are an expert code reviewer."""
 
     # Number of examples to include in the prompt
     num_top_programs: int = 3

@@ -155,7 +230,12 @@ def from_dict(cls, config_dict: Dict[str, Any]) -> "Config":
 
         # Update nested configs
         if "llm" in config_dict:
-            config.llm = LLMConfig(**config_dict["llm"])
+            llm_dict = config_dict["llm"]
+            if "models" in llm_dict:
+                llm_dict["models"] = [LLMModelConfig(**m) for m in llm_dict["models"]]
+            if "evaluator_models" in llm_dict:
+                llm_dict["evaluator_models"] = [LLMModelConfig(**m) for m in llm_dict["evaluator_models"]]
+            config.llm = LLMConfig(**llm_dict)
         if "prompt" in config_dict:
             config.prompt = PromptConfig(**config_dict["prompt"])
         if "database" in config_dict:

@@ -176,10 +256,8 @@ def to_dict(self) -> Dict[str, Any]:
             "random_seed": self.random_seed,
             # Component configurations
             "llm": {
-                "primary_model": self.llm.primary_model,
-                "primary_model_weight": self.llm.primary_model_weight,
-                "secondary_model": self.llm.secondary_model,
-                "secondary_model_weight": self.llm.secondary_model_weight,
+                "models": self.llm.models,
+                "evaluator_models": self.llm.evaluator_models,
                 "api_base": self.llm.api_base,
                 "temperature": self.llm.temperature,
                 "top_p": self.llm.top_p,

@@ -191,6 +269,7 @@ def to_dict(self) -> Dict[str, Any]:
             "prompt": {
                 "template_dir": self.prompt.template_dir,
                 "system_message": self.prompt.system_message,
+                "evaluator_system_message": self.prompt.evaluator_system_message,
                 "num_top_programs": self.prompt.num_top_programs,
                 "num_diverse_programs": self.prompt.num_diverse_programs,
                 "use_template_stochasticity": self.prompt.use_template_stochasticity,

@@ -245,16 +324,17 @@ def to_yaml(self, path: Union[str, Path]) -> None:
 def load_config(config_path: Optional[Union[str, Path]] = None) -> Config:
     """Load configuration from a YAML file or use defaults"""
     if config_path and os.path.exists(config_path):
-        return Config.from_yaml(config_path)
+        config = Config.from_yaml(config_path)
+    else:
+        config = Config()
+
+    # Use environment variables if available
+    api_key = os.environ.get("OPENAI_API_KEY")
+    api_base = os.environ.get("OPENAI_API_BASE", "https://api.openai.com/v1")
 
-    # Use environment variables if available
-    api_key = os.environ.get("OPENAI_API_KEY")
-    api_base = os.environ.get("OPENAI_API_BASE", "https://api.openai.com/v1")
+    config.llm.update_model_params({"api_key": api_key, "api_base": api_base})
 
-    config = Config()
-    if api_key:
-        config.llm.api_key = api_key
-    if api_base:
-        config.llm.api_base = api_base
+    # Make the system message available to the individual models, in case it is not provided from the prompt sampler
+    config.llm.update_model_params({"system_message": config.prompt.system_message})
 
     return config
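`load_config` now pushes environment variables and the prompt system message onto every model via `update_model_params`. A short sketch of the resulting behaviour; the API key value is hypothetical:

```python
# Sketch with a hypothetical key: load_config() applies OPENAI_API_KEY / OPENAI_API_BASE
# and prompt.system_message to every model that has not set its own value.
import os

from openevolve.config import load_config

os.environ["OPENAI_API_KEY"] = "sk-example"  # hypothetical value, for illustration only
config = load_config()  # no path given, so the defaults above are used

print(config.llm.models[0].api_key)  # 'sk-example'
print(config.llm.evaluator_models[0].system_message)
# 'You are an expert coder helping to improve programs through evolution.'
```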

openevolve/controller.py

Lines changed: 12 additions & 2 deletions
@@ -92,15 +92,25 @@ def __init__(
             self.file_extension = f".{self.file_extension}"
 
         # Initialize components
-        self.llm_ensemble = LLMEnsemble(self.config.llm)
+        self.llm_ensemble = LLMEnsemble(self.config.llm.models)
+        self.llm_evaluator_ensemble = LLMEnsemble(self.config.llm.evaluator_models)
+
         self.prompt_sampler = PromptSampler(self.config.prompt)
+        self.evaluator_prompt_sampler = PromptSampler(self.config.prompt)
+        self.evaluator_prompt_sampler.set_templates("evaluator_system_message")
 
         # Pass random seed to database if specified
         if self.config.random_seed is not None:
             self.config.database.random_seed = self.config.random_seed
 
         self.database = ProgramDatabase(self.config.database)
-        self.evaluator = Evaluator(self.config.evaluator, evaluation_file, self.llm_ensemble)
+
+        self.evaluator = Evaluator(
+            self.config.evaluator,
+            evaluation_file,
+            self.llm_evaluator_ensemble,
+            self.evaluator_prompt_sampler,
+        )
 
         logger.info(f"Initialized OpenEvolve with {initial_program_path} " f"and {evaluation_file}")
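The controller now builds a separate ensemble and prompt sampler for evaluation; evaluator.py (next file) calls a new `generate_all_with_context()` on that ensemble. The ensemble.py diff is not included in this excerpt, so the following is only a guessed sketch of such a fan-out, assuming each model wrapper exposes an async `generate_with_context(system, messages)` coroutine:

```python
# Guessed sketch only: ensemble.py's actual diff is not shown in this excerpt.
# Assumes each model wrapper exposes an async generate_with_context(system, messages).
import asyncio
from typing import Any, Dict, List


async def generate_all_with_context(
    models: List[Any], system: str, messages: List[Dict[str, str]]
) -> List[str]:
    """Send the same conversation to every ensemble member; one response per model."""
    return await asyncio.gather(
        *(model.generate_with_context(system, messages) for model in models)
    )
```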

openevolve/evaluator.py

Lines changed: 48 additions & 45 deletions
@@ -14,10 +14,12 @@
 import uuid
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+import traceback
 
 from openevolve.config import EvaluatorConfig
 from openevolve.llm.ensemble import LLMEnsemble
 from openevolve.utils.async_utils import TaskPool, run_in_executor
+from openevolve.prompt.sampler import PromptSampler
 
 logger = logging.getLogger(__name__)
 

@@ -35,10 +37,12 @@ def __init__(
         config: EvaluatorConfig,
         evaluation_file: str,
         llm_ensemble: Optional[LLMEnsemble] = None,
+        prompt_sampler: Optional[PromptSampler] = None,
     ):
         self.config = config
         self.evaluation_file = evaluation_file
         self.llm_ensemble = llm_ensemble
+        self.prompt_sampler = prompt_sampler
 
         # Create a task pool for parallel evaluation
         self.task_pool = TaskPool(max_concurrency=config.parallel_evaluations)

@@ -285,67 +289,66 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
 
         try:
             # Create prompt for LLM
-            prompt = f"""
-Evaluate the following code on a scale of 0.0 to 1.0 for the following metrics:
-1. Readability: How easy is the code to read and understand?
-2. Maintainability: How easy would the code be to maintain and modify?
-3. Efficiency: How efficient is the code in terms of time and space complexity?
-
-For each metric, provide a score between 0.0 and 1.0, where 1.0 is best.
-
-Code to evaluate:
-```python
-{program_code}
-```
-
-Return your evaluation as a JSON object with the following format:
-{{
-    "readability": [score],
-    "maintainability": [score],
-    "efficiency": [score],
-    "reasoning": "[brief explanation of scores]"
-}}
-"""
+            prompt = self.prompt_sampler.build_prompt(
+                current_program=program_code, template_key="evaluation"
+            )
 
             # Get LLM response
-            response = await self.llm_ensemble.generate(prompt)
+            responses = await self.llm_ensemble.generate_all_with_context(
+                prompt["system"], [{"role": "user", "content": prompt["user"]}]
+            )
 
             # Extract JSON from response
             try:
                 # Try to find JSON block
                 json_pattern = r"```json\n(.*?)\n```"
                 import re
 
-                json_match = re.search(json_pattern, response, re.DOTALL)
-
-                if json_match:
-                    json_str = json_match.group(1)
-                else:
-                    # Try to extract JSON directly
-                    json_str = response
-                    # Remove non-JSON parts
-                    start_idx = json_str.find("{")
-                    end_idx = json_str.rfind("}") + 1
-                    if start_idx >= 0 and end_idx > start_idx:
-                        json_str = json_str[start_idx:end_idx]
-
-                # Parse JSON
-                result = json.loads(json_str)
-
-                # Extract metrics
-                metrics = {}
-                for key in ["readability", "maintainability", "efficiency"]:
-                    if key in result:
-                        metrics[key] = float(result[key])
-
-                return metrics
+                avg_metrics = {}
+                for i, response in enumerate(responses):
+                    json_match = re.search(json_pattern, response, re.DOTALL)
+
+                    if json_match:
+                        json_str = json_match.group(1)
+                    else:
+                        # Try to extract JSON directly
+                        json_str = response
+                        # Remove non-JSON parts
+                        start_idx = json_str.find("{")
+                        end_idx = json_str.rfind("}") + 1
+                        if start_idx >= 0 and end_idx > start_idx:
+                            json_str = json_str[start_idx:end_idx]
+
+                    # Parse JSON
+                    result = json.loads(json_str)
+
+                    # Filter all non-numeric values
+                    metrics = {
+                        name: float(value)
+                        for name, value in result.items()
+                        if isinstance(value, (int, float))
+                    }
+
+                    # Weight of the model in the ensemble
+                    weight = self.llm_ensemble.weights[i] if self.llm_ensemble.weights else 1.0
+
+                    # Average the metrics
+                    for name, value in metrics.items():
+                        if name in avg_metrics:
+                            avg_metrics[name] += value * weight
+                        else:
+                            avg_metrics[name] = value * weight
+
+                return avg_metrics
 
             except Exception as e:
                 logger.warning(f"Error parsing LLM response: {str(e)}")
+                traceback.print_exc()
                 return {}
 
         except Exception as e:
             logger.error(f"Error in LLM evaluation: {str(e)}")
+            traceback.print_exc()
             return {}
 
     def _passes_threshold(self, metrics: Dict[str, float], threshold: float) -> bool:
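The rewritten `_llm_evaluate` collects one JSON response per ensemble member and accumulates the numeric fields weighted by each model's ensemble weight. A standalone sketch of that aggregation step (a weighted sum, which is only a true average if the weights are normalised, as the code above assumes):

```python
# Standalone re-implementation of the aggregation loop in _llm_evaluate above.
# Weights are taken as-is; they only yield a true average if they sum to 1.
from typing import Dict, List


def average_metrics(per_model: List[Dict[str, float]], weights: List[float]) -> Dict[str, float]:
    """Weight each model's numeric scores and accumulate them per metric name."""
    avg: Dict[str, float] = {}
    for metrics, weight in zip(per_model, weights):
        for name, value in metrics.items():
            avg[name] = avg.get(name, 0.0) + value * weight
    return avg


result = average_metrics(
    [{"readability": 0.9, "efficiency": 0.8}, {"readability": 0.7, "efficiency": 0.6}],
    [0.8, 0.2],
)
print({name: round(score, 2) for name, score in result.items()})
# {'readability': 0.86, 'efficiency': 0.76}
```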
