diff --git a/configs/default_config.yaml b/configs/default_config.yaml index 04181b056..fc512c26c 100644 --- a/configs/default_config.yaml +++ b/configs/default_config.yaml @@ -109,7 +109,7 @@ database: # - "diversity": Code structure diversity # # You can mix built-in features with custom metrics from your evaluator: - feature_dimensions: # Dimensions for MAP-Elites feature map + feature_dimensions: # Dimensions for MAP-Elites feature map (for diversity, NOT fitness) - "complexity" # Code length (built-in) - "diversity" # Code diversity (built-in) # Example with custom features: @@ -131,6 +131,9 @@ database: # Evaluator configuration evaluator: + # Fitness calculation: Uses 'combined_score' if available, otherwise averages + # all metrics EXCEPT those listed in database.feature_dimensions + # General settings timeout: 300 # Maximum evaluation time in seconds max_retries: 3 # Maximum number of retries for evaluation diff --git a/openevolve/_version.py b/openevolve/_version.py index 3e5f65b3e..237e778a8 100644 --- a/openevolve/_version.py +++ b/openevolve/_version.py @@ -1,3 +1,3 @@ """Version information for openevolve package.""" -__version__ = "0.1.3" +__version__ = "0.2.0" diff --git a/openevolve/database.py b/openevolve/database.py index 308233801..fa4ef05b9 100644 --- a/openevolve/database.py +++ b/openevolve/database.py @@ -19,7 +19,7 @@ from openevolve.config import DatabaseConfig from openevolve.utils.code_utils import calculate_edit_distance -from openevolve.utils.metrics_utils import safe_numeric_average +from openevolve.utils.metrics_utils import safe_numeric_average, get_fitness_score logger = logging.getLogger(__name__) @@ -247,8 +247,8 @@ def add( existing_program_id = self.feature_map[feature_key] if existing_program_id in self.programs: existing_program = self.programs[existing_program_id] - new_fitness = safe_numeric_average(program.metrics) - existing_fitness = safe_numeric_average(existing_program.metrics) + new_fitness = get_fitness_score(program.metrics, self.config.feature_dimensions) + existing_fitness = get_fitness_score(existing_program.metrics, self.config.feature_dimensions) logger.info( "MAP-Elites cell improved: %s (fitness: %.3f -> %.3f)", coords_dict, @@ -358,22 +358,15 @@ def get_best_program(self, metric: Optional[str] = None) -> Optional[Program]: ) if sorted_programs: logger.debug(f"Found best program by metric '{metric}': {sorted_programs[0].id}") - elif self.programs and all("combined_score" in p.metrics for p in self.programs.values()): - # Sort by combined_score if it exists (preferred method) - sorted_programs = sorted( - self.programs.values(), key=lambda p: p.metrics["combined_score"], reverse=True - ) - if sorted_programs: - logger.debug(f"Found best program by combined_score: {sorted_programs[0].id}") else: - # Sort by average of all numeric metrics as fallback + # Sort by fitness (excluding feature dimensions) sorted_programs = sorted( self.programs.values(), - key=lambda p: safe_numeric_average(p.metrics), + key=lambda p: get_fitness_score(p.metrics, self.config.feature_dimensions), reverse=True, ) if sorted_programs: - logger.debug(f"Found best program by average metrics: {sorted_programs[0].id}") + logger.debug(f"Found best program by fitness score: {sorted_programs[0].id}") # Update the best program tracking if we found a better program if sorted_programs and ( @@ -444,7 +437,7 @@ def get_top_programs( # Sort by combined_score if available, otherwise by average of all numeric metrics sorted_programs = sorted( candidates, - key=lambda p: 
p.metrics.get("combined_score", safe_numeric_average(p.metrics)), + key=lambda p: get_fitness_score(p.metrics, self.config.feature_dimensions), reverse=True, ) @@ -718,7 +711,8 @@ def _calculate_feature_coords(self, program: Program) -> List[int]: if not program.metrics: bin_idx = 0 else: - avg_score = safe_numeric_average(program.metrics) + # Use fitness score for "score" dimension (consistent with rest of system) + avg_score = get_fitness_score(program.metrics, self.config.feature_dimensions) # Update stats and scale self._update_feature_stats("score", avg_score) scaled_value = self._scale_feature_value("score", avg_score) @@ -818,7 +812,10 @@ def _feature_coords_to_key(self, coords: List[int]) -> str: def _is_better(self, program1: Program, program2: Program) -> bool: """ - Determine if program1 is better than program2 + Determine if program1 has better FITNESS than program2 + + Uses fitness calculation that excludes MAP-Elites feature dimensions + to prevent pollution of fitness comparisons. Args: program1: First program @@ -837,15 +834,11 @@ def _is_better(self, program1: Program, program2: Program) -> bool: if not program1.metrics and program2.metrics: return False - # Check for combined_score first (this is the preferred metric) - if "combined_score" in program1.metrics and "combined_score" in program2.metrics: - return program1.metrics["combined_score"] > program2.metrics["combined_score"] - - # Fallback to average of all numeric metrics - avg1 = safe_numeric_average(program1.metrics) - avg2 = safe_numeric_average(program2.metrics) + # Compare fitness (excluding feature dimensions) + fitness1 = get_fitness_score(program1.metrics, self.config.feature_dimensions) + fitness2 = get_fitness_score(program2.metrics, self.config.feature_dimensions) - return avg1 > avg2 + return fitness1 > fitness2 def _update_archive(self, program: Program) -> None: """ @@ -882,7 +875,7 @@ def _update_archive(self, program: Program) -> None: # Find worst program among valid programs if valid_archive_programs: worst_program = min( - valid_archive_programs, key=lambda p: p.metrics.get("combined_score", safe_numeric_average(p.metrics)) + valid_archive_programs, key=lambda p: get_fitness_score(p.metrics, self.config.feature_dimensions) ) # Replace if new program is better @@ -1287,7 +1280,7 @@ def _enforce_population_limit(self, exclude_program_id: Optional[str] = None) -> # Sort by combined_score if available, otherwise by average metric (worst first) sorted_programs = sorted( all_programs, - key=lambda p: p.metrics.get("combined_score", safe_numeric_average(p.metrics)), + key=lambda p: get_fitness_score(p.metrics, self.config.feature_dimensions), ) # Remove worst programs, but never remove the best program or excluded program @@ -1387,7 +1380,7 @@ def migrate_programs(self) -> None: # Sort by fitness (using combined_score or average metrics) island_programs.sort( - key=lambda p: p.metrics.get("combined_score", safe_numeric_average(p.metrics)), + key=lambda p: get_fitness_score(p.metrics, self.config.feature_dimensions), reverse=True, ) @@ -1558,7 +1551,7 @@ def get_island_stats(self) -> List[dict]: if island_programs: scores = [ - p.metrics.get("combined_score", safe_numeric_average(p.metrics)) + get_fitness_score(p.metrics, self.config.feature_dimensions) for p in island_programs ] diff --git a/openevolve/evaluator.py b/openevolve/evaluator.py index 0e25470b0..6cfc00fa4 100644 --- a/openevolve/evaluator.py +++ b/openevolve/evaluator.py @@ -563,8 +563,9 @@ async def _llm_evaluate(self, program_code: str, 
program_id: str = "") -> Dict[s try: # Create prompt for LLM + feature_dimensions = self.database.config.feature_dimensions if self.database else [] prompt = self.prompt_sampler.build_prompt( - current_program=program_code, template_key="evaluation" + current_program=program_code, template_key="evaluation", feature_dimensions=feature_dimensions ) # Get LLM response diff --git a/openevolve/iteration.py b/openevolve/iteration.py index df64a5508..2e3509dfa 100644 --- a/openevolve/iteration.py +++ b/openevolve/iteration.py @@ -70,6 +70,7 @@ async def run_iteration_with_shared_db( evolution_round=iteration, diff_based_evolution=config.diff_based_evolution, program_artifacts=parent_artifacts if parent_artifacts else None, + feature_dimensions=database.config.feature_dimensions, ) result = Result(parent=parent) diff --git a/openevolve/process_parallel.py b/openevolve/process_parallel.py index c9ffba051..5c01b8b40 100644 --- a/openevolve/process_parallel.py +++ b/openevolve/process_parallel.py @@ -172,6 +172,7 @@ def _run_iteration_worker( evolution_round=iteration, diff_based_evolution=_worker_config.diff_based_evolution, program_artifacts=parent_artifacts, + feature_dimensions=db_snapshot.get("feature_dimensions", []), ) iteration_start = time.time() @@ -349,6 +350,7 @@ def _create_database_snapshot(self) -> Dict[str, Any]: "programs": {pid: prog.to_dict() for pid, prog in self.database.programs.items()}, "islands": [list(island) for island in self.database.islands], "current_island": self.database.current_island, + "feature_dimensions": self.database.config.feature_dimensions, "artifacts": {}, # Will be populated selectively } diff --git a/openevolve/prompt/sampler.py b/openevolve/prompt/sampler.py index 62d78f796..8ad1c5241 100644 --- a/openevolve/prompt/sampler.py +++ b/openevolve/prompt/sampler.py @@ -9,7 +9,7 @@ from openevolve.config import PromptConfig from openevolve.prompt.templates import TemplateManager from openevolve.utils.format_utils import format_metrics_safe -from openevolve.utils.metrics_utils import safe_numeric_average +from openevolve.utils.metrics_utils import safe_numeric_average, get_fitness_score, format_feature_coordinates logger = logging.getLogger(__name__) @@ -19,7 +19,7 @@ class PromptSampler: def __init__(self, config: PromptConfig): self.config = config - self.template_manager = TemplateManager(config.template_dir) + self.template_manager = TemplateManager(custom_template_dir=config.template_dir) # Initialize the random number generator random.seed() @@ -60,6 +60,7 @@ def build_prompt( diff_based_evolution: bool = True, template_key: Optional[str] = None, program_artifacts: Optional[Dict[str, Union[str, bytes]]] = None, + feature_dimensions: Optional[List[str]] = None, **kwargs: Any, ) -> Dict[str, str]: """ @@ -110,7 +111,7 @@ def build_prompt( # Identify areas for improvement improvement_areas = self._identify_improvement_areas( - current_program, parent_program, program_metrics, previous_programs + current_program, parent_program, program_metrics, previous_programs, feature_dimensions ) # Format evolution history @@ -127,9 +128,17 @@ def build_prompt( if self.config.use_template_stochasticity: user_template = self._apply_template_variations(user_template) + # Calculate fitness and feature coordinates for the new template format + feature_dimensions = feature_dimensions or [] + fitness_score = get_fitness_score(program_metrics, feature_dimensions) + feature_coords = format_feature_coordinates(program_metrics, feature_dimensions) + # Format the final user message 
user_message = user_template.format( metrics=metrics_str, + fitness_score=f"{fitness_score:.4f}", + feature_coords=feature_coords, + feature_dimensions=", ".join(feature_dimensions) if feature_dimensions else "None", improvement_areas=improvement_areas, evolution_history=evolution_history, current_program=current_program, @@ -163,74 +172,70 @@ def _identify_improvement_areas( parent_program: str, metrics: Dict[str, float], previous_programs: List[Dict[str, Any]], + feature_dimensions: Optional[List[str]] = None, ) -> str: - """Identify potential areas for improvement""" - # This method could be expanded to include more sophisticated analysis - # For now, we'll use a simple approach - + """Identify improvement areas with proper fitness/feature separation""" + improvement_areas = [] - - # Check program length - # Support both old and new parameter names for backward compatibility + feature_dimensions = feature_dimensions or [] + + # Calculate fitness (excluding feature dimensions) + current_fitness = get_fitness_score(metrics, feature_dimensions) + + # Track fitness changes (not individual metrics) + if previous_programs: + prev_metrics = previous_programs[-1].get("metrics", {}) + prev_fitness = get_fitness_score(prev_metrics, feature_dimensions) + + if current_fitness > prev_fitness: + msg = self.template_manager.get_fragment( + "fitness_improved", + prev=prev_fitness, + current=current_fitness + ) + improvement_areas.append(msg) + elif current_fitness < prev_fitness: + msg = self.template_manager.get_fragment( + "fitness_declined", + prev=prev_fitness, + current=current_fitness + ) + improvement_areas.append(msg) + elif abs(current_fitness - prev_fitness) < 1e-6: # Essentially unchanged + msg = self.template_manager.get_fragment( + "fitness_stable", + current=current_fitness + ) + improvement_areas.append(msg) + + # Note feature exploration (not good/bad, just informational) + if feature_dimensions: + feature_coords = format_feature_coordinates(metrics, feature_dimensions) + if feature_coords != "No feature coordinates": + msg = self.template_manager.get_fragment( + "exploring_region", + features=feature_coords + ) + improvement_areas.append(msg) + + # Code length check (configurable threshold) threshold = ( self.config.suggest_simplification_after_chars or self.config.code_length_threshold ) if threshold and len(current_program) > threshold: - improvement_areas.append( - "Consider simplifying the code to improve readability and maintainability" + msg = self.template_manager.get_fragment( + "code_too_long", + threshold=threshold ) - - # Check for performance patterns in previous attempts - if len(previous_programs) >= 2: - recent_attempts = previous_programs[-2:] - metrics_improved = [] - metrics_regressed = [] - - for metric, value in metrics.items(): - # Only compare numeric metrics - if not isinstance(value, (int, float)) or isinstance(value, bool): - continue - - improved = True - regressed = True - - for attempt in recent_attempts: - attempt_value = attempt["metrics"].get(metric, 0) - # Only compare if both values are numeric - if isinstance(value, (int, float)) and isinstance(attempt_value, (int, float)): - if attempt_value <= value: - regressed = False - if attempt_value >= value: - improved = False - else: - # If either value is non-numeric, skip comparison - improved = False - regressed = False - - if improved and metric not in metrics_improved: - metrics_improved.append(metric) - if regressed and metric not in metrics_regressed: - metrics_regressed.append(metric) - - if 
metrics_improved: - improvement_areas.append( - f"Metrics showing improvement: {', '.join(metrics_improved)}. " - "Consider continuing with similar changes." - ) - - if metrics_regressed: - improvement_areas.append( - f"Metrics showing regression: {', '.join(metrics_regressed)}. " - "Consider reverting or revising recent changes in these areas." - ) - - # If we don't have specific improvements to suggest + improvement_areas.append(msg) + + # Default guidance if nothing specific if not improvement_areas: improvement_areas.append( - "Focus on optimizing the code for better performance on the target metrics" + self.template_manager.get_fragment("no_specific_guidance") ) - - return "\n".join([f"- {area}" for area in improvement_areas]) + + return "\n".join(f"- {area}" for area in improvement_areas) def _format_evolution_history( self, diff --git a/openevolve/prompt/templates.py b/openevolve/prompt/templates.py index bbb95e196..9ff03240b 100644 --- a/openevolve/prompt/templates.py +++ b/openevolve/prompt/templates.py @@ -3,8 +3,9 @@ """ import os +import json from pathlib import Path -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Union, Any # Base system message template for evolution BASE_SYSTEM_TEMPLATE = """You are an expert software developer tasked with iteratively improving a codebase. @@ -169,28 +170,81 @@ class TemplateManager: - """Manages templates for prompt generation""" - - def __init__(self, template_dir: Optional[str] = None): - self.templates = DEFAULT_TEMPLATES.copy() - - # Load templates from directory if provided - if template_dir and os.path.isdir(template_dir): - self._load_templates_from_dir(template_dir) - - def _load_templates_from_dir(self, template_dir: str) -> None: - """Load templates from a directory""" - for file_path in Path(template_dir).glob("*.txt"): - template_name = file_path.stem - with open(file_path, "r") as f: + """Manages templates with cascading override support""" + + def __init__(self, custom_template_dir: Optional[str] = None): + # Get default template directory + self.default_dir = Path(__file__).parent.parent / "prompts" / "defaults" + self.custom_dir = Path(custom_template_dir) if custom_template_dir else None + + # Load templates with cascading priority + self.templates = {} + self.fragments = {} + + # 1. Load defaults + self._load_from_directory(self.default_dir) + + # 2. Override with custom templates (if provided) + if self.custom_dir and self.custom_dir.exists(): + self._load_from_directory(self.custom_dir) + + # 3. Minimal hardcoded fallbacks (for safety/compatibility) + self._ensure_minimal_templates() + + def _load_from_directory(self, directory: Path) -> None: + """Load all templates and fragments from a directory""" + if not directory.exists(): + return + + # Load .txt templates + for txt_file in directory.glob("*.txt"): + template_name = txt_file.stem + with open(txt_file, 'r') as f: self.templates[template_name] = f.read() - - def get_template(self, template_name: str) -> str: + + # Load fragments.json if exists + fragments_file = directory / "fragments.json" + if fragments_file.exists(): + with open(fragments_file, 'r') as f: + loaded_fragments = json.load(f) + self.fragments.update(loaded_fragments) + + def _ensure_minimal_templates(self) -> None: + """Ensure critical templates exist (backward compatibility)""" + if "system_message" not in self.templates: + self.templates["system_message"] = "You are an AI assistant helping with code evolution." 
+ if "diff_user" not in self.templates: + self.templates["diff_user"] = "# Task\nImprove the program:\n```\n{current_program}\n```" + if "full_rewrite_user" not in self.templates: + self.templates["full_rewrite_user"] = "# Task\nRewrite the program:\n```\n{current_program}\n```" + + # Ensure critical fragments exist + if "fitness_improved" not in self.fragments: + self.fragments["fitness_improved"] = "Fitness improved: {prev} → {current}" + if "fitness_declined" not in self.fragments: + self.fragments["fitness_declined"] = "Fitness declined: {prev} → {current}" + if "no_specific_guidance" not in self.fragments: + self.fragments["no_specific_guidance"] = "Focus on improving fitness while maintaining diversity" + + def get_template(self, name: str) -> str: """Get a template by name""" - if template_name not in self.templates: - raise ValueError(f"Template '{template_name}' not found") - return self.templates[template_name] - + if name not in self.templates: + raise ValueError(f"Template '{name}' not found") + return self.templates[name] + + def get_fragment(self, name: str, **kwargs) -> str: + """Get and format a fragment""" + if name not in self.fragments: + return f"[Missing fragment: {name}]" + try: + return self.fragments[name].format(**kwargs) + except KeyError as e: + return f"[Fragment formatting error: {e}]" + def add_template(self, template_name: str, template: str) -> None: """Add or update a template""" self.templates[template_name] = template + + def add_fragment(self, fragment_name: str, fragment: str) -> None: + """Add or update a fragment""" + self.fragments[fragment_name] = fragment diff --git a/openevolve/prompts/defaults/diff_user.txt b/openevolve/prompts/defaults/diff_user.txt new file mode 100644 index 000000000..d7ac189dd --- /dev/null +++ b/openevolve/prompts/defaults/diff_user.txt @@ -0,0 +1,46 @@ +# Current Program Information +- Fitness: {fitness_score} +- Feature coordinates: {feature_coords} +- Focus areas: {improvement_areas} + +{artifacts} + +# Program Evolution History +{evolution_history} + +# Current Program +```{language} +{current_program} +``` + +# Task +Suggest improvements to the program that will improve its FITNESS SCORE. +The system maintains diversity across these dimensions: {feature_dimensions} +Different solutions with similar fitness but different features are valuable. + +You MUST use the exact SEARCH/REPLACE diff format shown below to indicate changes: + +<<<<<<< SEARCH +# Original code to find and replace (must match exactly) +======= +# New replacement code +>>>>>>> REPLACE + +Example of valid diff format: +<<<<<<< SEARCH +for i in range(m): + for j in range(p): + for k in range(n): + C[i, j] += A[i, k] * B[k, j] +======= +# Reorder loops for better memory access pattern +for i in range(m): + for k in range(n): + for j in range(p): + C[i, j] += A[i, k] * B[k, j] +>>>>>>> REPLACE + +You can suggest multiple changes. Each SEARCH section must exactly match code in the current program. +Be thoughtful about your changes and explain your reasoning thoroughly. + +IMPORTANT: Do not rewrite the entire program - focus on targeted improvements. \ No newline at end of file diff --git a/openevolve/prompts/defaults/evaluation.txt b/openevolve/prompts/defaults/evaluation.txt new file mode 100644 index 000000000..e9c9dd240 --- /dev/null +++ b/openevolve/prompts/defaults/evaluation.txt @@ -0,0 +1,19 @@ +Evaluate the following code on a scale of 0.0 to 1.0 for the following metrics: +1. Readability: How easy is the code to read and understand? +2. 
Maintainability: How easy would the code be to maintain and modify? +3. Efficiency: How efficient is the code in terms of time and space complexity? + +For each metric, provide a score between 0.0 and 1.0, where 1.0 is best. + +Code to evaluate: +```python +{current_program} +``` + +Return your evaluation as a JSON object with the following format: +{{ + "readability": [score], + "maintainability": [score], + "efficiency": [score], + "reasoning": "[brief explanation of scores]" +}} \ No newline at end of file diff --git a/openevolve/prompts/defaults/evaluator_system_message.txt b/openevolve/prompts/defaults/evaluator_system_message.txt new file mode 100644 index 000000000..e85320fe1 --- /dev/null +++ b/openevolve/prompts/defaults/evaluator_system_message.txt @@ -0,0 +1,2 @@ +You are an expert code reviewer. +Your job is to analyze the provided code and evaluate it systematically. \ No newline at end of file diff --git a/openevolve/prompts/defaults/evolution_history.txt b/openevolve/prompts/defaults/evolution_history.txt new file mode 100644 index 000000000..aa55226e4 --- /dev/null +++ b/openevolve/prompts/defaults/evolution_history.txt @@ -0,0 +1,9 @@ +## Previous Attempts + +{previous_attempts} + +## Top Performing Programs + +{top_programs} + +{inspirations_section} \ No newline at end of file diff --git a/openevolve/prompts/defaults/fragments.json b/openevolve/prompts/defaults/fragments.json new file mode 100644 index 000000000..47b6bb4b1 --- /dev/null +++ b/openevolve/prompts/defaults/fragments.json @@ -0,0 +1,18 @@ +{ + "fitness_improved": "Fitness improved: {prev:.4f} → {current:.4f}", + "fitness_declined": "Fitness declined: {prev:.4f} → {current:.4f}. Consider revising recent changes.", + "fitness_stable": "Fitness unchanged at {current:.4f}", + "exploring_region": "Exploring {features} region of solution space", + "metrics_label": "Metrics: {metrics}", + "outcome_all_improved": "All metrics improved", + "outcome_all_regressed": "All metrics regressed", + "outcome_mixed": "Mixed results", + "outcome_fitness_improved": "Fitness improved (exploring new features)", + "key_features_prefix": "Strong in", + "code_too_long": "Consider simplifying - code length exceeds {threshold} characters", + "no_specific_guidance": "Focus on improving fitness while maintaining diversity", + "metrics_improved": "Metrics showing improvement: {metrics}. Consider continuing with similar approaches.", + "metrics_regressed": "Metrics showing changes: {metrics}. Consider different approaches in these areas.", + "code_simplification": "Consider simplifying the code to improve readability and maintainability", + "default_improvement": "Focus on improving the fitness score while exploring diverse solutions" +} \ No newline at end of file diff --git a/openevolve/prompts/defaults/full_rewrite_user.txt b/openevolve/prompts/defaults/full_rewrite_user.txt new file mode 100644 index 000000000..324931e45 --- /dev/null +++ b/openevolve/prompts/defaults/full_rewrite_user.txt @@ -0,0 +1,27 @@ +# Current Program Information +- Fitness: {fitness_score} +- Feature coordinates: {feature_coords} +- Focus areas: {improvement_areas} + +{artifacts} + +# Program Evolution History +{evolution_history} + +# Current Program +```{language} +{current_program} +``` + +# Task +Rewrite the program to improve its FITNESS SCORE. +The system maintains diversity across these dimensions: {feature_dimensions} +Different solutions with similar fitness but different features are valuable. +Provide the complete new program code. 
+ +IMPORTANT: Make sure your rewritten program maintains the same inputs and outputs +as the original program, but with improved internal implementation. + +```{language} +# Your rewritten program here +``` \ No newline at end of file diff --git a/openevolve/prompts/defaults/inspiration_program.txt b/openevolve/prompts/defaults/inspiration_program.txt new file mode 100644 index 000000000..77b3c4790 --- /dev/null +++ b/openevolve/prompts/defaults/inspiration_program.txt @@ -0,0 +1,5 @@ +### Inspiration {program_number} (Score: {score}, Type: {program_type}) +```{language} +{program_snippet} +``` +Unique approach: {unique_features} \ No newline at end of file diff --git a/openevolve/prompts/defaults/inspirations_section.txt b/openevolve/prompts/defaults/inspirations_section.txt new file mode 100644 index 000000000..da6adf43e --- /dev/null +++ b/openevolve/prompts/defaults/inspirations_section.txt @@ -0,0 +1,5 @@ +## Inspiration Programs + +These programs represent diverse approaches and creative solutions that may inspire new ideas: + +{inspiration_programs} \ No newline at end of file diff --git a/openevolve/prompts/defaults/previous_attempt.txt b/openevolve/prompts/defaults/previous_attempt.txt new file mode 100644 index 000000000..3d944c148 --- /dev/null +++ b/openevolve/prompts/defaults/previous_attempt.txt @@ -0,0 +1,4 @@ +### Attempt {attempt_number} +- Changes: {changes} +- Metrics: {performance} +- Outcome: {outcome} \ No newline at end of file diff --git a/openevolve/prompts/defaults/system_message.txt b/openevolve/prompts/defaults/system_message.txt new file mode 100644 index 000000000..65f2a589a --- /dev/null +++ b/openevolve/prompts/defaults/system_message.txt @@ -0,0 +1,3 @@ +You are an expert software developer tasked with iteratively improving a codebase. +Your goal is to maximize the FITNESS SCORE while exploring diverse solutions across feature dimensions. +The system maintains a collection of diverse programs - both high fitness AND diversity are valuable. \ No newline at end of file diff --git a/openevolve/prompts/defaults/top_program.txt b/openevolve/prompts/defaults/top_program.txt new file mode 100644 index 000000000..34764292e --- /dev/null +++ b/openevolve/prompts/defaults/top_program.txt @@ -0,0 +1,5 @@ +### Program {program_number} (Score: {score}) +```{language} +{program_snippet} +``` +Key features: {key_features} \ No newline at end of file diff --git a/openevolve/utils/metrics_utils.py b/openevolve/utils/metrics_utils.py index f2da61f49..e21567ba1 100644 --- a/openevolve/utils/metrics_utils.py +++ b/openevolve/utils/metrics_utils.py @@ -2,7 +2,7 @@ Safe calculation utilities for metrics containing mixed types """ -from typing import Any, Dict +from typing import Any, Dict, List, Optional def safe_numeric_average(metrics: Dict[str, Any]) -> float: @@ -64,3 +64,86 @@ def safe_numeric_sum(metrics: Dict[str, Any]) -> float: continue return numeric_sum + + +def get_fitness_score( + metrics: Dict[str, Any], + feature_dimensions: Optional[List[str]] = None +) -> float: + """ + Calculate fitness score, excluding MAP-Elites feature dimensions + + This ensures that MAP-Elites features don't pollute the fitness calculation + when combined_score is not available. 
+ + Args: + metrics: All metrics from evaluation + feature_dimensions: List of MAP-Elites dimensions to exclude from fitness + + Returns: + Fitness score (combined_score if available, otherwise average of non-feature metrics) + """ + if not metrics: + return 0.0 + + # Always prefer combined_score if available + if "combined_score" in metrics: + try: + return float(metrics["combined_score"]) + except (ValueError, TypeError): + pass + + # Otherwise, average only non-feature metrics + feature_dimensions = feature_dimensions or [] + fitness_metrics = {} + + for key, value in metrics.items(): + # Exclude MAP feature dimensions from fitness calculation + if key not in feature_dimensions: + if isinstance(value, (int, float)): + try: + float_val = float(value) + if not (float_val != float_val): # Check for NaN + fitness_metrics[key] = float_val + except (ValueError, TypeError, OverflowError): + continue + + # If no non-feature metrics, fall back to all metrics (backward compatibility) + if not fitness_metrics: + return safe_numeric_average(metrics) + + return safe_numeric_average(fitness_metrics) + + +def format_feature_coordinates( + metrics: Dict[str, Any], + feature_dimensions: List[str] +) -> str: + """ + Format feature coordinates for display in prompts + + Args: + metrics: All metrics from evaluation + feature_dimensions: List of MAP-Elites feature dimensions + + Returns: + Formatted string showing feature coordinates + """ + feature_values = [] + for dim in feature_dimensions: + if dim in metrics: + value = metrics[dim] + if isinstance(value, (int, float)): + try: + float_val = float(value) + if not (float_val != float_val): # Check for NaN + feature_values.append(f"{dim}={float_val:.2f}") + except (ValueError, TypeError, OverflowError): + feature_values.append(f"{dim}={value}") + else: + feature_values.append(f"{dim}={value}") + + if not feature_values: + return "No feature coordinates" + + return ", ".join(feature_values) diff --git a/tests/test_prompt_sampler.py b/tests/test_prompt_sampler.py index 771a962b7..fc29e9730 100644 --- a/tests/test_prompt_sampler.py +++ b/tests/test_prompt_sampler.py @@ -46,7 +46,8 @@ def test_build_prompt(self): self.assertIn("system", prompt) self.assertIn("user", prompt) self.assertIn("def test(): pass", prompt["user"]) - self.assertIn("score: 0.5", prompt["user"]) + # Check that the score value appears in the prompt (either as fitness or metrics) + self.assertIn("0.5", prompt["user"]) if __name__ == "__main__":
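Usage note (not part of the patch): a minimal sketch of how the new fitness helpers behave once this change lands. The import path and function signatures come from the `openevolve/utils/metrics_utils.py` hunk above; the metric names `accuracy` and `speed` are illustrative placeholders, and `feature_dims` mirrors `database.feature_dimensions` from the config.

```python
# Illustrative sketch only -- metric names are made up; the helpers are the
# ones added to openevolve/utils/metrics_utils.py by this patch.
from openevolve.utils.metrics_utils import format_feature_coordinates, get_fitness_score

metrics = {"accuracy": 1.0, "speed": 0.5, "complexity": 0.3, "diversity": 0.8}
feature_dims = ["complexity", "diversity"]  # mirrors database.feature_dimensions

# Without combined_score, fitness averages only the non-feature metrics.
print(get_fitness_score(metrics, feature_dims))  # 0.75, i.e. (1.0 + 0.5) / 2

# When combined_score is present, it always takes precedence.
print(get_fitness_score({**metrics, "combined_score": 0.9}, feature_dims))  # 0.9

# Feature dimensions are surfaced separately, for the prompt templates.
print(format_feature_coordinates(metrics, feature_dims))  # complexity=0.30, diversity=0.80
```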