5 changes: 4 additions & 1 deletion configs/default_config.yaml
@@ -109,7 +109,7 @@ database:
# - "diversity": Code structure diversity
#
# You can mix built-in features with custom metrics from your evaluator:
feature_dimensions: # Dimensions for MAP-Elites feature map
feature_dimensions: # Dimensions for MAP-Elites feature map (for diversity, NOT fitness)
- "complexity" # Code length (built-in)
- "diversity" # Code diversity (built-in)
# Example with custom features:
@@ -131,6 +131,9 @@ database:

# Evaluator configuration
evaluator:
# Fitness calculation: Uses 'combined_score' if available, otherwise averages
# all metrics EXCEPT those listed in database.feature_dimensions

# General settings
timeout: 300 # Maximum evaluation time in seconds
max_retries: 3 # Maximum number of retries for evaluation
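For reference, a minimal sketch of the fitness rule the new comment describes: prefer combined_score when present, otherwise average the numeric metrics that are not MAP-Elites feature dimensions. This is an illustration of the rule only, not the shipped get_fitness_score from openevolve/utils/metrics_utils.py.

from typing import Any, Dict, List

def fitness_sketch(metrics: Dict[str, Any], feature_dimensions: List[str]) -> float:
    # Prefer an explicit combined_score if the evaluator supplies one.
    if "combined_score" in metrics:
        return float(metrics["combined_score"])
    # Otherwise average numeric metrics, excluding MAP-Elites feature dimensions
    # (and booleans, which Python would otherwise treat as 0/1).
    values = [
        v
        for k, v in metrics.items()
        if k not in feature_dimensions
        and isinstance(v, (int, float))
        and not isinstance(v, bool)
    ]
    return sum(values) / len(values) if values else 0.0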
2 changes: 1 addition & 1 deletion openevolve/_version.py
@@ -1,3 +1,3 @@
"""Version information for openevolve package."""

__version__ = "0.1.3"
__version__ = "0.2.0"
49 changes: 21 additions & 28 deletions openevolve/database.py
@@ -19,7 +19,7 @@

from openevolve.config import DatabaseConfig
from openevolve.utils.code_utils import calculate_edit_distance
from openevolve.utils.metrics_utils import safe_numeric_average
from openevolve.utils.metrics_utils import safe_numeric_average, get_fitness_score

logger = logging.getLogger(__name__)

@@ -247,8 +247,8 @@ def add(
existing_program_id = self.feature_map[feature_key]
if existing_program_id in self.programs:
existing_program = self.programs[existing_program_id]
new_fitness = safe_numeric_average(program.metrics)
existing_fitness = safe_numeric_average(existing_program.metrics)
new_fitness = get_fitness_score(program.metrics, self.config.feature_dimensions)
existing_fitness = get_fitness_score(existing_program.metrics, self.config.feature_dimensions)
logger.info(
"MAP-Elites cell improved: %s (fitness: %.3f -> %.3f)",
coords_dict,
@@ -358,22 +358,15 @@ def get_best_program(self, metric: Optional[str] = None) -> Optional[Program]:
)
if sorted_programs:
logger.debug(f"Found best program by metric '{metric}': {sorted_programs[0].id}")
elif self.programs and all("combined_score" in p.metrics for p in self.programs.values()):
# Sort by combined_score if it exists (preferred method)
sorted_programs = sorted(
self.programs.values(), key=lambda p: p.metrics["combined_score"], reverse=True
)
if sorted_programs:
logger.debug(f"Found best program by combined_score: {sorted_programs[0].id}")
else:
# Sort by average of all numeric metrics as fallback
# Sort by fitness (excluding feature dimensions)
sorted_programs = sorted(
self.programs.values(),
key=lambda p: safe_numeric_average(p.metrics),
key=lambda p: get_fitness_score(p.metrics, self.config.feature_dimensions),
reverse=True,
)
if sorted_programs:
logger.debug(f"Found best program by average metrics: {sorted_programs[0].id}")
logger.debug(f"Found best program by fitness score: {sorted_programs[0].id}")

# Update the best program tracking if we found a better program
if sorted_programs and (
@@ -444,7 +437,7 @@ def get_top_programs(
# Sort by combined_score if available, otherwise by average of all numeric metrics
sorted_programs = sorted(
candidates,
key=lambda p: p.metrics.get("combined_score", safe_numeric_average(p.metrics)),
key=lambda p: get_fitness_score(p.metrics, self.config.feature_dimensions),
reverse=True,
)

@@ -718,7 +711,7 @@ def _calculate_feature_coords(self, program: Program) -> List[int]:
if not program.metrics:
bin_idx = 0
else:
avg_score = safe_numeric_average(program.metrics)
# Use fitness score for "score" dimension (consistent with rest of system)
avg_score = get_fitness_score(program.metrics, self.config.feature_dimensions)
# Update stats and scale
self._update_feature_stats("score", avg_score)
scaled_value = self._scale_feature_value("score", avg_score)
@@ -818,7 +812,7 @@ def _feature_coords_to_key(self, coords: List[int]) -> str:

def _is_better(self, program1: Program, program2: Program) -> bool:
"""
Determine if program1 is better than program2
Determine if program1 has better FITNESS than program2

Uses fitness calculation that excludes MAP-Elites feature dimensions
to prevent pollution of fitness comparisons.

Args:
program1: First program
@@ -837,15 +834,11 @@ def _is_better(self, program1: Program, program2: Program) -> bool:
if not program1.metrics and program2.metrics:
return False

# Check for combined_score first (this is the preferred metric)
if "combined_score" in program1.metrics and "combined_score" in program2.metrics:
return program1.metrics["combined_score"] > program2.metrics["combined_score"]

# Fallback to average of all numeric metrics
avg1 = safe_numeric_average(program1.metrics)
avg2 = safe_numeric_average(program2.metrics)
# Compare fitness (excluding feature dimensions)
fitness1 = get_fitness_score(program1.metrics, self.config.feature_dimensions)
fitness2 = get_fitness_score(program2.metrics, self.config.feature_dimensions)

return avg1 > avg2
return fitness1 > fitness2

def _update_archive(self, program: Program) -> None:
"""
@@ -882,7 +875,7 @@ def _update_archive(self, program: Program) -> None:
# Find worst program among valid programs
if valid_archive_programs:
worst_program = min(
valid_archive_programs, key=lambda p: p.metrics.get("combined_score", safe_numeric_average(p.metrics))
valid_archive_programs, key=lambda p: get_fitness_score(p.metrics, self.config.feature_dimensions)
)

# Replace if new program is better
@@ -1287,7 +1280,7 @@ def _enforce_population_limit(self, exclude_program_id: Optional[str] = None) ->
# Sort by combined_score if available, otherwise by average metric (worst first)
sorted_programs = sorted(
all_programs,
key=lambda p: p.metrics.get("combined_score", safe_numeric_average(p.metrics)),
key=lambda p: get_fitness_score(p.metrics, self.config.feature_dimensions),
)

# Remove worst programs, but never remove the best program or excluded program
@@ -1387,7 +1380,7 @@ def migrate_programs(self) -> None:

# Sort by fitness (using combined_score or average metrics)
island_programs.sort(
key=lambda p: p.metrics.get("combined_score", safe_numeric_average(p.metrics)),
key=lambda p: get_fitness_score(p.metrics, self.config.feature_dimensions),
reverse=True,
)

@@ -1558,7 +1551,7 @@ def get_island_stats(self) -> List[dict]:

if island_programs:
scores = [
p.metrics.get("combined_score", safe_numeric_average(p.metrics))
get_fitness_score(p.metrics, self.config.feature_dimensions)
for p in island_programs
]

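To see why these comparisons switch from safe_numeric_average to get_fitness_score, consider a hypothetical program whose metrics mix a fitness score with raw feature values (numbers invented for illustration):

metrics = {"combined_score": 0.82, "complexity": 145.0, "diversity": 12.0}
feature_dimensions = ["complexity", "diversity"]

# Old behaviour: safe_numeric_average(metrics) averages everything,
# (0.82 + 145.0 + 12.0) / 3 ≈ 52.6, so the large feature values swamp the
# actual quality signal in best-program, archive, and migration comparisons.
# New behaviour: get_fitness_score(metrics, feature_dimensions) returns 0.82,
# since combined_score is preferred and feature dimensions are excluded.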
3 changes: 2 additions & 1 deletion openevolve/evaluator.py
@@ -563,8 +563,9 @@ async def _llm_evaluate(self, program_code: str, program_id: str = "") -> Dict[s

try:
# Create prompt for LLM
feature_dimensions = self.database.config.feature_dimensions if self.database else []
prompt = self.prompt_sampler.build_prompt(
current_program=program_code, template_key="evaluation"
current_program=program_code, template_key="evaluation", feature_dimensions=feature_dimensions
)

# Get LLM response
1 change: 1 addition & 0 deletions openevolve/iteration.py
@@ -70,6 +70,7 @@ async def run_iteration_with_shared_db(
evolution_round=iteration,
diff_based_evolution=config.diff_based_evolution,
program_artifacts=parent_artifacts if parent_artifacts else None,
feature_dimensions=database.config.feature_dimensions,
)

result = Result(parent=parent)
2 changes: 2 additions & 0 deletions openevolve/process_parallel.py
@@ -172,6 +172,7 @@ def _run_iteration_worker(
evolution_round=iteration,
diff_based_evolution=_worker_config.diff_based_evolution,
program_artifacts=parent_artifacts,
feature_dimensions=db_snapshot.get("feature_dimensions", []),
)

iteration_start = time.time()
Expand Down Expand Up @@ -349,6 +350,7 @@ def _create_database_snapshot(self) -> Dict[str, Any]:
"programs": {pid: prog.to_dict() for pid, prog in self.database.programs.items()},
"islands": [list(island) for island in self.database.islands],
"current_island": self.database.current_island,
"feature_dimensions": self.database.config.feature_dimensions,
"artifacts": {}, # Will be populated selectively
}

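Together with the iteration.py change above, this keeps worker processes consistent with the main process: the snapshot carries the configured dimensions, and each worker threads them into prompt construction. A condensed, illustrative view (parent_code and parent_metrics are placeholder names, not a verbatim excerpt):

# Main process: the snapshot records the configured MAP-Elites dimensions.
db_snapshot = {
    "feature_dimensions": database.config.feature_dimensions,
    # ... programs, islands, current_island, artifacts ...
}

# Worker process: read them back (defaulting to []) and pass them to the
# prompt sampler so fitness and feature coordinates are reported consistently.
prompt = prompt_sampler.build_prompt(
    current_program=parent_code,
    program_metrics=parent_metrics,
    feature_dimensions=db_snapshot.get("feature_dimensions", []),
)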
127 changes: 66 additions & 61 deletions openevolve/prompt/sampler.py
@@ -9,7 +9,7 @@
from openevolve.config import PromptConfig
from openevolve.prompt.templates import TemplateManager
from openevolve.utils.format_utils import format_metrics_safe
from openevolve.utils.metrics_utils import safe_numeric_average
from openevolve.utils.metrics_utils import safe_numeric_average, get_fitness_score, format_feature_coordinates

logger = logging.getLogger(__name__)

@@ -19,7 +19,7 @@ class PromptSampler:

def __init__(self, config: PromptConfig):
self.config = config
self.template_manager = TemplateManager(config.template_dir)
self.template_manager = TemplateManager(custom_template_dir=config.template_dir)

# Initialize the random number generator
random.seed()
@@ -60,6 +60,7 @@ def build_prompt(
diff_based_evolution: bool = True,
template_key: Optional[str] = None,
program_artifacts: Optional[Dict[str, Union[str, bytes]]] = None,
feature_dimensions: Optional[List[str]] = None,
**kwargs: Any,
) -> Dict[str, str]:
"""
@@ -110,7 +111,7 @@

# Identify areas for improvement
improvement_areas = self._identify_improvement_areas(
current_program, parent_program, program_metrics, previous_programs
current_program, parent_program, program_metrics, previous_programs, feature_dimensions
)

# Format evolution history
@@ -127,9 +128,17 @@
if self.config.use_template_stochasticity:
user_template = self._apply_template_variations(user_template)

# Calculate fitness and feature coordinates for the new template format
feature_dimensions = feature_dimensions or []
fitness_score = get_fitness_score(program_metrics, feature_dimensions)
feature_coords = format_feature_coordinates(program_metrics, feature_dimensions)

# Format the final user message
user_message = user_template.format(
metrics=metrics_str,
fitness_score=f"{fitness_score:.4f}",
feature_coords=feature_coords,
feature_dimensions=", ".join(feature_dimensions) if feature_dimensions else "None",
improvement_areas=improvement_areas,
evolution_history=evolution_history,
current_program=current_program,
@@ -163,74 +172,70 @@ def _identify_improvement_areas(
parent_program: str,
metrics: Dict[str, float],
previous_programs: List[Dict[str, Any]],
feature_dimensions: Optional[List[str]] = None,
) -> str:
"""Identify potential areas for improvement"""
# This method could be expanded to include more sophisticated analysis
# For now, we'll use a simple approach

"""Identify improvement areas with proper fitness/feature separation"""

improvement_areas = []

# Check program length
# Support both old and new parameter names for backward compatibility
feature_dimensions = feature_dimensions or []

# Calculate fitness (excluding feature dimensions)
current_fitness = get_fitness_score(metrics, feature_dimensions)

# Track fitness changes (not individual metrics)
if previous_programs:
prev_metrics = previous_programs[-1].get("metrics", {})
prev_fitness = get_fitness_score(prev_metrics, feature_dimensions)

if current_fitness > prev_fitness:
msg = self.template_manager.get_fragment(
"fitness_improved",
prev=prev_fitness,
current=current_fitness
)
improvement_areas.append(msg)
elif current_fitness < prev_fitness:
msg = self.template_manager.get_fragment(
"fitness_declined",
prev=prev_fitness,
current=current_fitness
)
improvement_areas.append(msg)
elif abs(current_fitness - prev_fitness) < 1e-6: # Essentially unchanged
msg = self.template_manager.get_fragment(
"fitness_stable",
current=current_fitness
)
improvement_areas.append(msg)

# Note feature exploration (not good/bad, just informational)
if feature_dimensions:
feature_coords = format_feature_coordinates(metrics, feature_dimensions)
if feature_coords != "No feature coordinates":
msg = self.template_manager.get_fragment(
"exploring_region",
features=feature_coords
)
improvement_areas.append(msg)

# Code length check (configurable threshold)
threshold = (
self.config.suggest_simplification_after_chars or self.config.code_length_threshold
)
if threshold and len(current_program) > threshold:
improvement_areas.append(
"Consider simplifying the code to improve readability and maintainability"
msg = self.template_manager.get_fragment(
"code_too_long",
threshold=threshold
)

# Check for performance patterns in previous attempts
if len(previous_programs) >= 2:
recent_attempts = previous_programs[-2:]
metrics_improved = []
metrics_regressed = []

for metric, value in metrics.items():
# Only compare numeric metrics
if not isinstance(value, (int, float)) or isinstance(value, bool):
continue

improved = True
regressed = True

for attempt in recent_attempts:
attempt_value = attempt["metrics"].get(metric, 0)
# Only compare if both values are numeric
if isinstance(value, (int, float)) and isinstance(attempt_value, (int, float)):
if attempt_value <= value:
regressed = False
if attempt_value >= value:
improved = False
else:
# If either value is non-numeric, skip comparison
improved = False
regressed = False

if improved and metric not in metrics_improved:
metrics_improved.append(metric)
if regressed and metric not in metrics_regressed:
metrics_regressed.append(metric)

if metrics_improved:
improvement_areas.append(
f"Metrics showing improvement: {', '.join(metrics_improved)}. "
"Consider continuing with similar changes."
)

if metrics_regressed:
improvement_areas.append(
f"Metrics showing regression: {', '.join(metrics_regressed)}. "
"Consider reverting or revising recent changes in these areas."
)

# If we don't have specific improvements to suggest
improvement_areas.append(msg)

# Default guidance if nothing specific
if not improvement_areas:
improvement_areas.append(
"Focus on optimizing the code for better performance on the target metrics"
self.template_manager.get_fragment("no_specific_guidance")
)

return "\n".join([f"- {area}" for area in improvement_areas])
return "\n".join(f"- {area}" for area in improvement_areas)

def _format_evolution_history(
self,
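The sampler relies on two helpers from openevolve/utils/metrics_utils.py: get_fitness_score (its rule is sketched after the config diff above) and format_feature_coordinates, which must return a readable string and the literal "No feature coordinates" when there is nothing to show. A rough sketch of the latter under that assumption, not the shipped implementation:

from typing import Any, Dict, List

def format_feature_coordinates_sketch(
    metrics: Dict[str, Any], feature_dimensions: List[str]
) -> str:
    # Render only the MAP-Elites feature values, e.g. "complexity=145.00, diversity=12.00".
    parts = [
        f"{dim}={metrics[dim]:.2f}"
        for dim in feature_dimensions
        if isinstance(metrics.get(dim), (int, float)) and not isinstance(metrics.get(dim), bool)
    ]
    return ", ".join(parts) if parts else "No feature coordinates"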