Commit 161c701

refactor prompt templates

1 parent 1bff46c commit 161c701

20 files changed: +400 additions, -114 deletions

configs/default_config.yaml
Lines changed: 4 additions & 1 deletion

@@ -109,7 +109,7 @@ database:
   #   - "diversity": Code structure diversity
   #
   # You can mix built-in features with custom metrics from your evaluator:
-  feature_dimensions:  # Dimensions for MAP-Elites feature map
+  feature_dimensions:  # Dimensions for MAP-Elites feature map (for diversity, NOT fitness)
     - "complexity"     # Code length (built-in)
     - "diversity"      # Code diversity (built-in)
   # Example with custom features:
@@ -131,6 +131,9 @@ database:

 # Evaluator configuration
 evaluator:
+  # Fitness calculation: Uses 'combined_score' if available, otherwise averages
+  # all metrics EXCEPT those listed in database.feature_dimensions
+
   # General settings
   timeout: 300       # Maximum evaluation time in seconds
   max_retries: 3     # Maximum number of retries for evaluation
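
The get_fitness_score helper is referenced throughout this commit, but its implementation is not part of the excerpt. A minimal sketch, assuming exactly the behavior the config comment above describes (prefer combined_score, otherwise average every numeric metric that is not a feature dimension):

# Hypothetical sketch of openevolve/utils/metrics_utils.get_fitness_score;
# the actual function body is not shown in this diff excerpt.
def get_fitness_score(metrics, feature_dimensions):
    """Fitness: 'combined_score' if present, else the average of all numeric
    metrics excluding MAP-Elites feature dimensions."""
    if "combined_score" in metrics:
        return metrics["combined_score"]
    values = [
        v
        for k, v in metrics.items()
        if k not in (feature_dimensions or [])
        and isinstance(v, (int, float))
        and not isinstance(v, bool)
    ]
    return sum(values) / len(values) if values else 0.0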

openevolve/database.py
Lines changed: 21 additions & 28 deletions

@@ -19,7 +19,7 @@

 from openevolve.config import DatabaseConfig
 from openevolve.utils.code_utils import calculate_edit_distance
-from openevolve.utils.metrics_utils import safe_numeric_average
+from openevolve.utils.metrics_utils import safe_numeric_average, get_fitness_score

 logger = logging.getLogger(__name__)

@@ -247,8 +247,8 @@ def add(
             existing_program_id = self.feature_map[feature_key]
             if existing_program_id in self.programs:
                 existing_program = self.programs[existing_program_id]
-                new_fitness = safe_numeric_average(program.metrics)
-                existing_fitness = safe_numeric_average(existing_program.metrics)
+                new_fitness = get_fitness_score(program.metrics, self.config.feature_dimensions)
+                existing_fitness = get_fitness_score(existing_program.metrics, self.config.feature_dimensions)
                 logger.info(
                     "MAP-Elites cell improved: %s (fitness: %.3f -> %.3f)",
                     coords_dict,
@@ -358,22 +358,15 @@ def get_best_program(self, metric: Optional[str] = None) -> Optional[Program]:
             )
             if sorted_programs:
                 logger.debug(f"Found best program by metric '{metric}': {sorted_programs[0].id}")
-        elif self.programs and all("combined_score" in p.metrics for p in self.programs.values()):
-            # Sort by combined_score if it exists (preferred method)
-            sorted_programs = sorted(
-                self.programs.values(), key=lambda p: p.metrics["combined_score"], reverse=True
-            )
-            if sorted_programs:
-                logger.debug(f"Found best program by combined_score: {sorted_programs[0].id}")
         else:
-            # Sort by average of all numeric metrics as fallback
+            # Sort by fitness (excluding feature dimensions)
             sorted_programs = sorted(
                 self.programs.values(),
-                key=lambda p: safe_numeric_average(p.metrics),
+                key=lambda p: get_fitness_score(p.metrics, self.config.feature_dimensions),
                 reverse=True,
             )
             if sorted_programs:
-                logger.debug(f"Found best program by average metrics: {sorted_programs[0].id}")
+                logger.debug(f"Found best program by fitness score: {sorted_programs[0].id}")

         # Update the best program tracking if we found a better program
         if sorted_programs and (
@@ -444,7 +437,7 @@ def get_top_programs(
         # Sort by combined_score if available, otherwise by average of all numeric metrics
         sorted_programs = sorted(
             candidates,
-            key=lambda p: p.metrics.get("combined_score", safe_numeric_average(p.metrics)),
+            key=lambda p: get_fitness_score(p.metrics, self.config.feature_dimensions),
             reverse=True,
         )

@@ -718,7 +711,8 @@ def _calculate_feature_coords(self, program: Program) -> List[int]:
                 if not program.metrics:
                     bin_idx = 0
                 else:
-                    avg_score = safe_numeric_average(program.metrics)
+                    # Use fitness score for "score" dimension (consistent with rest of system)
+                    avg_score = get_fitness_score(program.metrics, self.config.feature_dimensions)
                     # Update stats and scale
                     self._update_feature_stats("score", avg_score)
                     scaled_value = self._scale_feature_value("score", avg_score)
@@ -818,7 +812,10 @@ def _feature_coords_to_key(self, coords: List[int]) -> str:

     def _is_better(self, program1: Program, program2: Program) -> bool:
         """
-        Determine if program1 is better than program2
+        Determine if program1 has better FITNESS than program2
+
+        Uses fitness calculation that excludes MAP-Elites feature dimensions
+        to prevent pollution of fitness comparisons.

         Args:
             program1: First program
@@ -837,15 +834,11 @@ def _is_better(self, program1: Program, program2: Program) -> bool:
         if not program1.metrics and program2.metrics:
             return False

-        # Check for combined_score first (this is the preferred metric)
-        if "combined_score" in program1.metrics and "combined_score" in program2.metrics:
-            return program1.metrics["combined_score"] > program2.metrics["combined_score"]
-
-        # Fallback to average of all numeric metrics
-        avg1 = safe_numeric_average(program1.metrics)
-        avg2 = safe_numeric_average(program2.metrics)
+        # Compare fitness (excluding feature dimensions)
+        fitness1 = get_fitness_score(program1.metrics, self.config.feature_dimensions)
+        fitness2 = get_fitness_score(program2.metrics, self.config.feature_dimensions)

-        return avg1 > avg2
+        return fitness1 > fitness2

     def _update_archive(self, program: Program) -> None:
         """
@@ -882,7 +875,7 @@ def _update_archive(self, program: Program) -> None:
         # Find worst program among valid programs
         if valid_archive_programs:
             worst_program = min(
-                valid_archive_programs, key=lambda p: p.metrics.get("combined_score", safe_numeric_average(p.metrics))
+                valid_archive_programs, key=lambda p: get_fitness_score(p.metrics, self.config.feature_dimensions)
             )

             # Replace if new program is better
@@ -1287,7 +1280,7 @@ def _enforce_population_limit(self, exclude_program_id: Optional[str] = None) ->
         # Sort by combined_score if available, otherwise by average metric (worst first)
         sorted_programs = sorted(
             all_programs,
-            key=lambda p: p.metrics.get("combined_score", safe_numeric_average(p.metrics)),
+            key=lambda p: get_fitness_score(p.metrics, self.config.feature_dimensions),
         )

         # Remove worst programs, but never remove the best program or excluded program
@@ -1387,7 +1380,7 @@ def migrate_programs(self) -> None:

         # Sort by fitness (using combined_score or average metrics)
         island_programs.sort(
-            key=lambda p: p.metrics.get("combined_score", safe_numeric_average(p.metrics)),
+            key=lambda p: get_fitness_score(p.metrics, self.config.feature_dimensions),
             reverse=True,
         )

@@ -1558,7 +1551,7 @@ def get_island_stats(self) -> List[dict]:

         if island_programs:
             scores = [
-                p.metrics.get("combined_score", safe_numeric_average(p.metrics))
+                get_fitness_score(p.metrics, self.config.feature_dimensions)
                 for p in island_programs
             ]
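
The practical effect of swapping safe_numeric_average for get_fitness_score is that MAP-Elites feature descriptors no longer dilute selection. A small worked example, with hypothetical metric names and values (not from this commit):

# Hypothetical evaluator output; names and values are invented.
metrics = {"accuracy": 0.80, "complexity": 0.95, "diversity": 0.10}
feature_dimensions = ["complexity", "diversity"]

# Before: safe_numeric_average(metrics) = (0.80 + 0.95 + 0.10) / 3 = 0.6167,
# so the feature descriptors leak into _is_better(), archive updates,
# population trimming, and migration.
# After: get_fitness_score(metrics, feature_dimensions) = 0.80,
# so those comparisons rank programs on quality alone.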

openevolve/evaluator.py
Lines changed: 2 additions & 1 deletion

@@ -563,8 +563,9 @@ async def _llm_evaluate(self, program_code: str, program_id: str = "") -> Dict[s

         try:
             # Create prompt for LLM
+            feature_dimensions = self.database.config.feature_dimensions if self.database else []
             prompt = self.prompt_sampler.build_prompt(
-                current_program=program_code, template_key="evaluation"
+                current_program=program_code, template_key="evaluation", feature_dimensions=feature_dimensions
             )

             # Get LLM response

openevolve/iteration.py
Lines changed: 1 addition & 0 deletions

@@ -70,6 +70,7 @@ async def run_iteration_with_shared_db(
         evolution_round=iteration,
         diff_based_evolution=config.diff_based_evolution,
         program_artifacts=parent_artifacts if parent_artifacts else None,
+        feature_dimensions=database.config.feature_dimensions,
     )

     result = Result(parent=parent)

openevolve/process_parallel.py
Lines changed: 2 additions & 0 deletions

@@ -172,6 +172,7 @@ def _run_iteration_worker(
         evolution_round=iteration,
         diff_based_evolution=_worker_config.diff_based_evolution,
         program_artifacts=parent_artifacts,
+        feature_dimensions=db_snapshot.get("feature_dimensions", []),
     )

     iteration_start = time.time()
@@ -349,6 +350,7 @@ def _create_database_snapshot(self) -> Dict[str, Any]:
             "programs": {pid: prog.to_dict() for pid, prog in self.database.programs.items()},
             "islands": [list(island) for island in self.database.islands],
             "current_island": self.database.current_island,
+            "feature_dimensions": self.database.config.feature_dimensions,
             "artifacts": {},  # Will be populated selectively
         }

openevolve/prompt/sampler.py
Lines changed: 66 additions & 61 deletions

@@ -9,7 +9,7 @@
 from openevolve.config import PromptConfig
 from openevolve.prompt.templates import TemplateManager
 from openevolve.utils.format_utils import format_metrics_safe
-from openevolve.utils.metrics_utils import safe_numeric_average
+from openevolve.utils.metrics_utils import safe_numeric_average, get_fitness_score, format_feature_coordinates

 logger = logging.getLogger(__name__)

@@ -19,7 +19,7 @@ class PromptSampler:

     def __init__(self, config: PromptConfig):
         self.config = config
-        self.template_manager = TemplateManager(config.template_dir)
+        self.template_manager = TemplateManager(custom_template_dir=config.template_dir)

         # Initialize the random number generator
         random.seed()
@@ -60,6 +60,7 @@ def build_prompt(
         diff_based_evolution: bool = True,
         template_key: Optional[str] = None,
         program_artifacts: Optional[Dict[str, Union[str, bytes]]] = None,
+        feature_dimensions: Optional[List[str]] = None,
         **kwargs: Any,
     ) -> Dict[str, str]:
         """
@@ -110,7 +111,7 @@ def build_prompt(

         # Identify areas for improvement
         improvement_areas = self._identify_improvement_areas(
-            current_program, parent_program, program_metrics, previous_programs
+            current_program, parent_program, program_metrics, previous_programs, feature_dimensions
         )

         # Format evolution history
@@ -127,9 +128,17 @@ def build_prompt(
         if self.config.use_template_stochasticity:
             user_template = self._apply_template_variations(user_template)

+        # Calculate fitness and feature coordinates for the new template format
+        feature_dimensions = feature_dimensions or []
+        fitness_score = get_fitness_score(program_metrics, feature_dimensions)
+        feature_coords = format_feature_coordinates(program_metrics, feature_dimensions)
+
         # Format the final user message
         user_message = user_template.format(
             metrics=metrics_str,
+            fitness_score=f"{fitness_score:.4f}",
+            feature_coords=feature_coords,
+            feature_dimensions=", ".join(feature_dimensions) if feature_dimensions else "None",
             improvement_areas=improvement_areas,
             evolution_history=evolution_history,
             current_program=current_program,
@@ -163,74 +172,70 @@ def _identify_improvement_areas(
         parent_program: str,
         metrics: Dict[str, float],
         previous_programs: List[Dict[str, Any]],
+        feature_dimensions: Optional[List[str]] = None,
     ) -> str:
-        """Identify potential areas for improvement"""
-        # This method could be expanded to include more sophisticated analysis
-        # For now, we'll use a simple approach
-
+        """Identify improvement areas with proper fitness/feature separation"""
+
         improvement_areas = []
-
-        # Check program length
-        # Support both old and new parameter names for backward compatibility
+        feature_dimensions = feature_dimensions or []
+
+        # Calculate fitness (excluding feature dimensions)
+        current_fitness = get_fitness_score(metrics, feature_dimensions)
+
+        # Track fitness changes (not individual metrics)
+        if previous_programs:
+            prev_metrics = previous_programs[-1].get("metrics", {})
+            prev_fitness = get_fitness_score(prev_metrics, feature_dimensions)
+
+            if current_fitness > prev_fitness:
+                msg = self.template_manager.get_fragment(
+                    "fitness_improved",
+                    prev=prev_fitness,
+                    current=current_fitness
+                )
+                improvement_areas.append(msg)
+            elif current_fitness < prev_fitness:
+                msg = self.template_manager.get_fragment(
+                    "fitness_declined",
+                    prev=prev_fitness,
+                    current=current_fitness
+                )
+                improvement_areas.append(msg)
+            elif abs(current_fitness - prev_fitness) < 1e-6:  # Essentially unchanged
+                msg = self.template_manager.get_fragment(
+                    "fitness_stable",
+                    current=current_fitness
+                )
+                improvement_areas.append(msg)
+
+        # Note feature exploration (not good/bad, just informational)
+        if feature_dimensions:
+            feature_coords = format_feature_coordinates(metrics, feature_dimensions)
+            if feature_coords != "No feature coordinates":
+                msg = self.template_manager.get_fragment(
+                    "exploring_region",
+                    features=feature_coords
+                )
+                improvement_areas.append(msg)
+
+        # Code length check (configurable threshold)
         threshold = (
             self.config.suggest_simplification_after_chars or self.config.code_length_threshold
         )
         if threshold and len(current_program) > threshold:
-            improvement_areas.append(
-                "Consider simplifying the code to improve readability and maintainability"
+            msg = self.template_manager.get_fragment(
+                "code_too_long",
+                threshold=threshold
             )
-
-        # Check for performance patterns in previous attempts
-        if len(previous_programs) >= 2:
-            recent_attempts = previous_programs[-2:]
-            metrics_improved = []
-            metrics_regressed = []
-
-            for metric, value in metrics.items():
-                # Only compare numeric metrics
-                if not isinstance(value, (int, float)) or isinstance(value, bool):
-                    continue
-
-                improved = True
-                regressed = True
-
-                for attempt in recent_attempts:
-                    attempt_value = attempt["metrics"].get(metric, 0)
-                    # Only compare if both values are numeric
-                    if isinstance(value, (int, float)) and isinstance(attempt_value, (int, float)):
-                        if attempt_value <= value:
-                            regressed = False
-                        if attempt_value >= value:
-                            improved = False
-                    else:
-                        # If either value is non-numeric, skip comparison
-                        improved = False
-                        regressed = False
-
-                if improved and metric not in metrics_improved:
-                    metrics_improved.append(metric)
-                if regressed and metric not in metrics_regressed:
-                    metrics_regressed.append(metric)
-
-            if metrics_improved:
-                improvement_areas.append(
-                    f"Metrics showing improvement: {', '.join(metrics_improved)}. "
-                    "Consider continuing with similar changes."
-                )
-
-            if metrics_regressed:
-                improvement_areas.append(
-                    f"Metrics showing regression: {', '.join(metrics_regressed)}. "
-                    "Consider reverting or revising recent changes in these areas."
-                )
-
-        # If we don't have specific improvements to suggest
+            improvement_areas.append(msg)
+
+        # Default guidance if nothing specific
         if not improvement_areas:
             improvement_areas.append(
-                "Focus on optimizing the code for better performance on the target metrics"
+                self.template_manager.get_fragment("no_specific_guidance")
            )
-
-        return "\n".join([f"- {area}" for area in improvement_areas])
+
+        return "\n".join(f"- {area}" for area in improvement_areas)

     def _format_evolution_history(
         self,
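Two helpers referenced above are not shown anywhere in this excerpt: format_feature_coordinates (imported from openevolve.utils.metrics_utils) and TemplateManager.get_fragment, which _identify_improvement_areas now calls with the fragment keys fitness_improved, fitness_declined, fitness_stable, exploring_region, code_too_long, and no_specific_guidance. Minimal sketches under assumed signatures; the fragment wording below is invented for illustration, not the commit's actual template text:

# Hypothetical sketch of format_feature_coordinates from
# openevolve/utils/metrics_utils.py. The sentinel string must match the
# check in _identify_improvement_areas above.
def format_feature_coordinates(metrics, feature_dimensions):
    """Render a program's position in MAP-Elites feature space,
    e.g. 'complexity=0.95, diversity=0.10'."""
    coords = {
        dim: metrics[dim]
        for dim in (feature_dimensions or [])
        if dim in metrics and isinstance(metrics[dim], (int, float))
    }
    if not coords:
        return "No feature coordinates"
    return ", ".join(f"{dim}={value:.2f}" for dim, value in coords.items())


# Hypothetical sketch of the fragment lookup behind
# TemplateManager.get_fragment; the keys match the calls above, but the
# wording is assumed.
FRAGMENTS = {
    "fitness_improved": "Fitness improved from {prev:.4f} to {current:.4f}; continue in this direction",
    "fitness_declined": "Fitness declined from {prev:.4f} to {current:.4f}; consider revising recent changes",
    "fitness_stable": "Fitness stable at {current:.4f}; try a different approach to improve it",
    "exploring_region": "Currently exploring feature region: {features}",
    "code_too_long": "Code exceeds {threshold} characters; consider simplifying",
    "no_specific_guidance": "Focus on improving the fitness of the program",
}


def get_fragment(name: str, **kwargs) -> str:
    """Look up a prompt fragment by key and interpolate keyword arguments."""
    return FRAGMENTS[name].format(**kwargs)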