diff --git a/examples/rust_adaptive_sort/config.yaml b/examples/rust_adaptive_sort/config.yaml index 0f5649d5f..497942891 100644 --- a/examples/rust_adaptive_sort/config.yaml +++ b/examples/rust_adaptive_sort/config.yaml @@ -49,9 +49,5 @@ evaluator: timeout: 60 # Rust compilation can take time parallel_evaluations: 3 - # Use cascade evaluation for performance testing - cascade_evaluation: true - cascade_thresholds: - - 0.5 # Compilation success and basic correctness - - 0.7 # Good performance - - 0.85 # Excellent adaptability \ No newline at end of file + # Direct evaluation - evaluator doesn't implement cascade functions + cascade_evaluation: false \ No newline at end of file diff --git a/openevolve/database.py b/openevolve/database.py index 8aba55283..253b66fd5 100644 --- a/openevolve/database.py +++ b/openevolve/database.py @@ -122,6 +122,9 @@ def __init__(self, config: DatabaseConfig): # Track the absolute best program separately self.best_program_id: Optional[str] = None + + # Track best program per island for proper island-based evolution + self.island_best_programs: List[Optional[str]] = [None] * config.num_islands # Track the last iteration number (for resuming) self.last_iteration: int = 0 @@ -186,6 +189,28 @@ def add( should_replace = self._is_better(program, self.programs[existing_program_id]) if should_replace: + # Log significant MAP-Elites events + coords_dict = {self.config.feature_dimensions[i]: feature_coords[i] for i in range(len(feature_coords))} + + if feature_key not in self.feature_map: + # New cell occupation + logging.info("New MAP-Elites cell occupied: %s", coords_dict) + # Check coverage milestone + total_possible_cells = self.feature_bins ** len(self.config.feature_dimensions) + coverage = (len(self.feature_map) + 1) / total_possible_cells + if coverage in [0.1, 0.25, 0.5, 0.75, 0.9]: + logging.info("MAP-Elites coverage reached %.1f%% (%d/%d cells)", + coverage * 100, len(self.feature_map) + 1, total_possible_cells) + else: + # Cell replacement - existing program being replaced + existing_program_id = self.feature_map[feature_key] + if existing_program_id in self.programs: + existing_program = self.programs[existing_program_id] + new_fitness = safe_numeric_average(program.metrics) + existing_fitness = safe_numeric_average(existing_program.metrics) + logging.info("MAP-Elites cell improved: %s (fitness: %.3f -> %.3f)", + coords_dict, existing_fitness, new_fitness) + self.feature_map[feature_key] = program.id # Add to specific island (not random!) 
@@ -205,6 +230,9 @@ def add( # Update the absolute best program tracking (after population enforcement) self._update_best_program(program) + + # Update island-specific best program tracking + self._update_island_best_program(program, island_idx) # Save to disk if configured if self.config.db_path: @@ -315,13 +343,14 @@ def get_best_program(self, metric: Optional[str] = None) -> Optional[Program]: return sorted_programs[0] if sorted_programs else None - def get_top_programs(self, n: int = 10, metric: Optional[str] = None) -> List[Program]: + def get_top_programs(self, n: int = 10, metric: Optional[str] = None, island_idx: Optional[int] = None) -> List[Program]: """ Get the top N programs based on a metric Args: n: Number of programs to return metric: Metric to use for ranking (uses average if None) + island_idx: If specified, only return programs from this island Returns: List of top programs @@ -329,17 +358,32 @@ def get_top_programs(self, n: int = 10, metric: Optional[str] = None) -> List[Pr if not self.programs: return [] + # Get candidate programs + if island_idx is not None: + # Island-specific query + island_programs = [ + self.programs[pid] for pid in self.islands[island_idx] + if pid in self.programs + ] + candidates = island_programs + else: + # Global query + candidates = list(self.programs.values()) + + if not candidates: + return [] + if metric: # Sort by specific metric sorted_programs = sorted( - [p for p in self.programs.values() if metric in p.metrics], + [p for p in candidates if metric in p.metrics], key=lambda p: p.metrics[metric], reverse=True, ) else: # Sort by average of all numeric metrics sorted_programs = sorted( - self.programs.values(), + candidates, key=lambda p: safe_numeric_average(p.metrics), reverse=True, ) @@ -379,6 +423,7 @@ def save(self, path: Optional[str] = None, iteration: int = 0) -> None: "islands": [list(island) for island in self.islands], "archive": list(self.archive), "best_program_id": self.best_program_id, + "island_best_programs": self.island_best_programs, "last_iteration": iteration or self.last_iteration, "current_island": self.current_island, "island_generations": self.island_generations, @@ -412,6 +457,7 @@ def load(self, path: str) -> None: saved_islands = metadata.get("islands", []) self.archive = set(metadata.get("archive", [])) self.best_program_id = metadata.get("best_program_id") + self.island_best_programs = metadata.get("island_best_programs", [None] * len(saved_islands)) self.last_iteration = metadata.get("last_iteration", 0) self.current_island = metadata.get("current_island", 0) self.island_generations = metadata.get("island_generations", [0] * len(saved_islands)) @@ -440,6 +486,10 @@ def load(self, path: str) -> None: # Ensure island_generations list has correct length if len(self.island_generations) != len(self.islands): self.island_generations = [0] * len(self.islands) + + # Ensure island_best_programs list has correct length + if len(self.island_best_programs) != len(self.islands): + self.island_best_programs = [None] * len(self.islands) logger.info(f"Loaded database with {len(self.programs)} programs from {path}") @@ -487,6 +537,9 @@ def _reconstruct_islands(self, saved_islands: List[List[str]]) -> None: feature_keys_to_remove.append(key) for key in feature_keys_to_remove: del self.feature_map[key] + + # Clean up island best programs - remove stale references + self._cleanup_stale_island_bests() # Check best program if self.best_program_id and self.best_program_id not in self.programs: @@ -613,7 +666,8 @@ def 
_calculate_feature_coords(self, program: Program) -> List[int]: else: # Default to middle bin if feature not found coords.append(self.feature_bins // 2) - logging.info( + # Only log coordinates at debug level for troubleshooting + logging.debug( "MAP-Elites coords: %s", str({self.config.feature_dimensions[i]: coords[i] for i in range(len(coords))}), ) @@ -748,6 +802,53 @@ def _update_best_program(self, program: Program) -> None: else: logger.info(f"New best program {program.id} replaces {old_id}") + def _update_island_best_program(self, program: Program, island_idx: int) -> None: + """ + Update the best program tracking for a specific island + + Args: + program: Program to consider as the new best for the island + island_idx: Island index + """ + # Ensure island_idx is valid + if island_idx >= len(self.island_best_programs): + logger.warning(f"Invalid island index {island_idx}, skipping island best update") + return + + # If island doesn't have a best program yet, this becomes the best + current_island_best_id = self.island_best_programs[island_idx] + if current_island_best_id is None: + self.island_best_programs[island_idx] = program.id + logger.debug(f"Set initial best program for island {island_idx} to {program.id}") + return + + # Check if current best still exists + if current_island_best_id not in self.programs: + logger.warning( + f"Island {island_idx} best program {current_island_best_id} no longer exists, updating to {program.id}" + ) + self.island_best_programs[island_idx] = program.id + return + + current_island_best = self.programs[current_island_best_id] + + # Update if the new program is better + if self._is_better(program, current_island_best): + old_id = current_island_best_id + self.island_best_programs[island_idx] = program.id + + # Log the change + if "combined_score" in program.metrics and "combined_score" in current_island_best.metrics: + old_score = current_island_best.metrics["combined_score"] + new_score = program.metrics["combined_score"] + score_diff = new_score - old_score + logger.debug( + f"Island {island_idx}: New best program {program.id} replaces {old_id} " + f"(combined_score: {old_score:.4f} → {new_score:.4f}, +{score_diff:.4f})" + ) + else: + logger.debug(f"Island {island_idx}: New best program {program.id} replaces {old_id}") + def _sample_parent(self) -> Program: """ Sample a parent program from the current island for the next evolution step @@ -869,91 +970,124 @@ def _sample_random_parent(self) -> Program: def _sample_inspirations(self, parent: Program, n: int = 5) -> List[Program]: """ - Sample inspiration programs for the next evolution step + Sample inspiration programs for the next evolution step. + + For proper island-based evolution, inspirations are sampled ONLY from the + current island, maintaining genetic isolation between islands. 
Args: parent: Parent program n: Number of inspirations to sample Returns: - List of inspiration programs + List of inspiration programs from the current island """ inspirations = [] + + # Get the parent's island (should be current_island) + parent_island = parent.metadata.get("island", self.current_island) + + # Get all programs from the current island + island_program_ids = list(self.islands[parent_island]) + island_programs = [self.programs[pid] for pid in island_program_ids if pid in self.programs] + + if not island_programs: + logger.warning(f"Island {parent_island} has no programs for inspiration sampling") + return [] - # Always include the absolute best program if available and different from parent + # Include the island's best program if available and different from parent + island_best_id = self.island_best_programs[parent_island] if ( - self.best_program_id is not None - and self.best_program_id != parent.id - and self.best_program_id in self.programs + island_best_id is not None + and island_best_id != parent.id + and island_best_id in self.programs ): - best_program = self.programs[self.best_program_id] - inspirations.append(best_program) - logger.debug(f"Including best program {self.best_program_id} in inspirations") - elif self.best_program_id is not None and self.best_program_id not in self.programs: - # Clean up stale best program reference + island_best = self.programs[island_best_id] + inspirations.append(island_best) + logger.debug(f"Including island {parent_island} best program {island_best_id} in inspirations") + elif island_best_id is not None and island_best_id not in self.programs: + # Clean up stale island best reference logger.warning( - f"Best program {self.best_program_id} no longer exists, clearing reference" + f"Island {parent_island} best program {island_best_id} no longer exists, clearing reference" ) - self.best_program_id = None + self.island_best_programs[parent_island] = None - # Add top programs as inspirations + # Add top programs from the island as inspirations top_n = max(1, int(n * self.config.elite_selection_ratio)) - top_programs = self.get_top_programs(n=top_n) - for program in top_programs: + top_island_programs = self.get_top_programs(n=top_n, island_idx=parent_island) + for program in top_island_programs: if program.id not in [p.id for p in inspirations] and program.id != parent.id: inspirations.append(program) - # Add diverse programs using config.num_diverse_programs - if len(self.programs) > n and len(inspirations) < n: - # Calculate how many diverse programs to add (up to remaining slots) + # Add diverse programs from within the island + if len(island_programs) > n and len(inspirations) < n: remaining_slots = n - len(inspirations) - # Sample from different feature cells for diversity + # Try to sample from different feature cells within the island feature_coords = self._calculate_feature_coords(parent) - - # Get programs from nearby feature cells nearby_programs = [] - for _ in range(remaining_slots): + + # Create a mapping of feature cells to island programs for efficient lookup + island_feature_map = {} + for prog_id in island_program_ids: + if prog_id in self.programs: + prog = self.programs[prog_id] + prog_coords = self._calculate_feature_coords(prog) + cell_key = self._feature_coords_to_key(prog_coords) + island_feature_map[cell_key] = prog_id + + # Try to find programs from nearby feature cells within the island + for _ in range(remaining_slots * 3): # Try more times to find nearby programs # Perturb coordinates perturbed_coords = [ 
- max(0, min(self.feature_bins - 1, c + random.randint(-1, 1))) + max(0, min(self.feature_bins - 1, c + random.randint(-2, 2))) for c in feature_coords ] - - # Try to get program from this cell + cell_key = self._feature_coords_to_key(perturbed_coords) - if cell_key in self.feature_map: - program_id = self.feature_map[cell_key] - # Check if program still exists before adding + if cell_key in island_feature_map: + program_id = island_feature_map[cell_key] if ( program_id != parent.id and program_id not in [p.id for p in inspirations] + and program_id not in [p.id for p in nearby_programs] and program_id in self.programs ): nearby_programs.append(self.programs[program_id]) - elif program_id not in self.programs: - # Clean up stale reference in feature_map - logger.debug(f"Removing stale program {program_id} from feature_map") - del self.feature_map[cell_key] + if len(nearby_programs) >= remaining_slots: + break - # If we need more, add random programs + # If we still need more, add random programs from the island if len(inspirations) + len(nearby_programs) < n: remaining = n - len(inspirations) - len(nearby_programs) - all_ids = set(self.programs.keys()) + + # Get available programs from the island excluded_ids = ( {parent.id} .union(p.id for p in inspirations) .union(p.id for p in nearby_programs) ) - available_ids = list(all_ids - excluded_ids) - - if available_ids: - random_ids = random.sample(available_ids, min(remaining, len(available_ids))) + available_island_ids = [ + pid for pid in island_program_ids + if pid not in excluded_ids and pid in self.programs + ] + + if available_island_ids: + random_ids = random.sample( + available_island_ids, + min(remaining, len(available_island_ids)) + ) random_programs = [self.programs[pid] for pid in random_ids] nearby_programs.extend(random_programs) inspirations.extend(nearby_programs) + # Log island isolation info + logger.debug( + f"Sampled {len(inspirations)} inspirations from island {parent_island} " + f"(island has {len(island_programs)} programs total)" + ) + return inspirations[:n] def _enforce_population_limit(self, exclude_program_id: Optional[str] = None) -> None: @@ -1030,6 +1164,9 @@ def _enforce_population_limit(self, exclude_program_id: Optional[str] = None) -> logger.debug(f"Removed program {program_id} due to population limit") logger.info(f"Population size after cleanup: {len(self.programs)}") + + # Clean up any stale island best program references after removal + self._cleanup_stale_island_bests() # Island management methods def set_current_island(self, island_idx: int) -> None: @@ -1103,14 +1240,106 @@ def migrate_programs(self) -> None: # Add to target island self.islands[target_island].add(migrant_copy.id) self.programs[migrant_copy.id] = migrant_copy + + # Update island-specific best program if migrant is better + self._update_island_best_program(migrant_copy, target_island) - logger.debug( - f"Migrated program {migrant.id} from island {i} to island {target_island}" - ) + # Log migration with MAP-Elites coordinates + feature_coords = self._calculate_feature_coords(migrant_copy) + coords_dict = {self.config.feature_dimensions[j]: feature_coords[j] for j in range(len(feature_coords))} + logger.info("Program migrated to island %d at MAP-Elites coords: %s", + target_island, coords_dict) # Update last migration generation self.last_migration_generation = max(self.island_generations) logger.info(f"Migration completed at generation {self.last_migration_generation}") + + # Validate migration results + 
self._validate_migration_results() + + def _validate_migration_results(self) -> None: + """ + Validate migration didn't create inconsistencies + + Checks that: + 1. Program island metadata matches actual island assignment + 2. No programs are assigned to multiple islands + 3. All island best programs exist and are in correct islands + """ + seen_program_ids = set() + + for i, island in enumerate(self.islands): + for program_id in island: + # Check for duplicate assignments + if program_id in seen_program_ids: + logger.error(f"Program {program_id} assigned to multiple islands") + continue + seen_program_ids.add(program_id) + + # Check program exists + if program_id not in self.programs: + logger.warning(f"Island {i} contains nonexistent program {program_id}") + continue + + # Check metadata consistency + program = self.programs[program_id] + stored_island = program.metadata.get("island") + if stored_island != i: + logger.warning( + f"Island mismatch for program {program_id}: " + f"in island {i} but metadata says {stored_island}" + ) + + # Validate island best programs + for i, best_id in enumerate(self.island_best_programs): + if best_id is not None: + if best_id not in self.programs: + logger.warning(f"Island {i} best program {best_id} does not exist") + elif best_id not in self.islands[i]: + logger.warning(f"Island {i} best program {best_id} not in island") + + def _cleanup_stale_island_bests(self) -> None: + """ + Remove stale island best program references + + Cleans up references to programs that no longer exist in the database + or are not actually in their assigned islands. + """ + cleaned_count = 0 + + for i, best_id in enumerate(self.island_best_programs): + if best_id is not None: + should_clear = False + + # Check if program still exists + if best_id not in self.programs: + logger.debug(f"Clearing stale island {i} best program {best_id} (program deleted)") + should_clear = True + # Check if program is still in the island + elif best_id not in self.islands[i]: + logger.debug(f"Clearing stale island {i} best program {best_id} (not in island)") + should_clear = True + + if should_clear: + self.island_best_programs[i] = None + cleaned_count += 1 + + if cleaned_count > 0: + logger.info(f"Cleaned up {cleaned_count} stale island best program references") + + # Recalculate best programs for islands that were cleared + for i, best_id in enumerate(self.island_best_programs): + if best_id is None and len(self.islands[i]) > 0: + # Find new best program for this island + island_programs = [self.programs[pid] for pid in self.islands[i] if pid in self.programs] + if island_programs: + # Sort by fitness and update + best_program = max( + island_programs, + key=lambda p: p.metrics.get("combined_score", safe_numeric_average(p.metrics)) + ) + self.island_best_programs[i] = best_program.id + logger.debug(f"Recalculated island {i} best program: {best_program.id}") def get_island_stats(self) -> List[dict]: """Get statistics for each island""" @@ -1214,10 +1443,13 @@ def log_island_status(self) -> None: logger.info("Island Status:") for stat in stats: current_marker = " *" if stat["is_current"] else " " + island_idx = stat['island'] + island_best_id = self.island_best_programs[island_idx] if island_idx < len(self.island_best_programs) else None + best_indicator = f" (best: {island_best_id})" if island_best_id else "" logger.info( f"{current_marker} Island {stat['island']}: {stat['population_size']} programs, " f"best={stat['best_score']:.4f}, avg={stat['average_score']:.4f}, " - 
f"diversity={stat['diversity']:.2f}, gen={stat['generation']}" + f"diversity={stat['diversity']:.2f}, gen={stat['generation']}{best_indicator}" ) # Artifact storage and retrieval methods diff --git a/openevolve/evaluator.py b/openevolve/evaluator.py index dfe966f50..2ab93f361 100644 --- a/openevolve/evaluator.py +++ b/openevolve/evaluator.py @@ -89,10 +89,42 @@ def _load_evaluation_function(self) -> None: self.evaluate_function = module.evaluate logger.info(f"Successfully loaded evaluation function from {self.evaluation_file}") + + # Validate cascade configuration + self._validate_cascade_configuration(module) except Exception as e: logger.error(f"Error loading evaluation function: {str(e)}") raise + def _validate_cascade_configuration(self, module) -> None: + """ + Validate cascade evaluation configuration and warn about potential issues + + Args: + module: The loaded evaluation module + """ + if self.config.cascade_evaluation: + # Check if cascade functions exist + has_stage1 = hasattr(module, "evaluate_stage1") + has_stage2 = hasattr(module, "evaluate_stage2") + has_stage3 = hasattr(module, "evaluate_stage3") + + if not has_stage1: + logger.warning( + f"Configuration has 'cascade_evaluation: true' but evaluator " + f"'{self.evaluation_file}' does not define 'evaluate_stage1' function. " + f"This will fall back to direct evaluation, making the cascade setting useless. " + f"Consider setting 'cascade_evaluation: false' or implementing cascade functions." + ) + elif not (has_stage2 or has_stage3): + logger.warning( + f"Evaluator '{self.evaluation_file}' defines 'evaluate_stage1' but no additional " + f"cascade stages (evaluate_stage2, evaluate_stage3). Consider implementing " + f"multi-stage evaluation for better cascade benefits." + ) + else: + logger.debug(f"Cascade evaluation properly configured with available stage functions") + async def evaluate_program( self, program_code: str, @@ -273,7 +305,7 @@ def get_pending_artifacts(self, program_id: str) -> Optional[Dict[str, Union[str """ return self._pending_artifacts.pop(program_id, None) - async def _direct_evaluate(self, program_path: str) -> Dict[str, float]: + async def _direct_evaluate(self, program_path: str) -> Union[Dict[str, float], EvaluationResult]: """ Directly evaluate a program using the evaluation function with timeout @@ -281,7 +313,7 @@ async def _direct_evaluate(self, program_path: str) -> Dict[str, float]: program_path: Path to the program file Returns: - Dictionary of metric name to score + Dictionary of metrics or EvaluationResult with metrics and artifacts Raises: asyncio.TimeoutError: If evaluation exceeds timeout @@ -296,11 +328,8 @@ async def run_evaluation(): # Run the evaluation with timeout - let exceptions bubble up for retry handling result = await asyncio.wait_for(run_evaluation(), timeout=self.config.timeout) - # Validate result - if not isinstance(result, dict): - logger.warning(f"Evaluation returned non-dictionary result: {result}") - return {"error": 0.0} - + # Return result as-is to be processed by _process_evaluation_result + # This supports both dict and EvaluationResult returns, just like _cascade_evaluate return result async def _cascade_evaluate( @@ -354,13 +383,14 @@ async def run_stage1(): ) except Exception as e: logger.error(f"Error in stage 1 evaluation: {str(e)}") - # Capture stage 1 failure as artifacts + # Capture stage 1 failure with enhanced context + error_context = self._create_cascade_error_context("stage1", e) return EvaluationResult( metrics={"stage1_passed": 0.0, "error": 0.0}, 
artifacts={ "stderr": str(e), "traceback": traceback.format_exc(), - "failure_stage": "stage1", + **error_context, }, ) @@ -481,13 +511,14 @@ async def run_stage3(): except Exception as e: logger.error(f"Error in cascade evaluation: {str(e)}") - # Return proper cascade failure result instead of re-raising + # Return proper cascade failure result with enhanced context + error_context = self._create_cascade_error_context("cascade_setup", e) return EvaluationResult( metrics={"stage1_passed": 0.0, "error": 0.0}, artifacts={ "stderr": str(e), "traceback": traceback.format_exc(), - "failure_stage": "cascade_setup", + **error_context, }, ) @@ -582,6 +613,29 @@ async def _llm_evaluate(self, program_code: str, program_id: str = "") -> Dict[s traceback.print_exc() return {} + def _create_cascade_error_context(self, stage: str, error: Exception) -> dict: + """ + Create rich error context for cascade failures + + Args: + stage: The stage where the error occurred + error: The exception that was raised + + Returns: + Dictionary with enhanced error context + """ + import time + return { + "failure_stage": stage, + "error_type": type(error).__name__, + "error_message": str(error), + "timestamp": time.time(), + "cascade_config": self.config.cascade_evaluation, + "cascade_thresholds": getattr(self.config, 'cascade_thresholds', []), + "timeout_config": self.config.timeout, + "evaluation_file": self.evaluation_file, + } + def _passes_threshold(self, metrics: Dict[str, float], threshold: float) -> bool: """ Check if metrics pass a threshold diff --git a/openevolve/iteration.py b/openevolve/iteration.py index 98db88f09..11d3453a8 100644 --- a/openevolve/iteration.py +++ b/openevolve/iteration.py @@ -53,16 +53,18 @@ async def run_iteration_with_shared_db( # Get artifacts for the parent program if available parent_artifacts = database.get_artifacts(parent.id) - # Get actual top programs for prompt context (separate from inspirations) - actual_top_programs = database.get_top_programs(5) + # Get island-specific top programs for prompt context (maintain island isolation) + parent_island = parent.metadata.get("island", database.current_island) + island_top_programs = database.get_top_programs(5, island_idx=parent_island) + island_previous_programs = database.get_top_programs(3, island_idx=parent_island) # Build prompt prompt = prompt_sampler.build_prompt( current_program=parent.code, parent_program=parent.code, program_metrics=parent.metrics, - previous_programs=[p.to_dict() for p in database.get_top_programs(3)], - top_programs=[p.to_dict() for p in actual_top_programs], + previous_programs=[p.to_dict() for p in island_previous_programs], + top_programs=[p.to_dict() for p in island_top_programs], inspirations=[p.to_dict() for p in inspirations], language=config.language, evolution_round=iteration, diff --git a/pyproject.toml b/pyproject.toml index abe90c44f..cc41df178 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "openevolve" -version = "0.0.14" +version = "0.0.15" description = "Open-source implementation of AlphaEvolve" readme = "README.md" requires-python = ">=3.9" diff --git a/setup.py b/setup.py index e876b1c90..4db6920e8 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name="openevolve", - version="0.0.14", + version="0.0.15", packages=find_packages(), include_package_data=True, ) diff --git a/tests/test_cascade_validation.py b/tests/test_cascade_validation.py new file mode 100644 index 000000000..0464b4278 --- /dev/null +++ 
b/tests/test_cascade_validation.py @@ -0,0 +1,301 @@ +""" +Tests for cascade evaluation validation functionality in openevolve.evaluator +""" + +import unittest +import tempfile +import os +from unittest.mock import patch, MagicMock +from openevolve.config import Config +from openevolve.evaluator import Evaluator +from openevolve.database import EvaluationResult + + +class TestCascadeValidation(unittest.TestCase): + """Tests for cascade evaluation configuration validation""" + + def setUp(self): + """Set up test evaluator with cascade validation""" + self.config = Config() + + # Create temporary evaluator files for testing + self.temp_dir = tempfile.mkdtemp() + + def tearDown(self): + """Clean up temporary files""" + # Clean up temp files + for file in os.listdir(self.temp_dir): + os.remove(os.path.join(self.temp_dir, file)) + os.rmdir(self.temp_dir) + + def _create_evaluator_file(self, filename: str, content: str) -> str: + """Helper to create temporary evaluator file""" + file_path = os.path.join(self.temp_dir, filename) + with open(file_path, 'w') as f: + f.write(content) + return file_path + + def test_cascade_validation_with_valid_evaluator(self): + """Test cascade validation with evaluator that has cascade functions""" + # Create evaluator with cascade functions + evaluator_content = ''' +def evaluate_stage1(program_path): + return {"stage1_score": 0.5} + +def evaluate_stage2(program_path): + return {"stage2_score": 0.7} + +def evaluate_stage3(program_path): + return {"stage3_score": 0.9} + +def evaluate(program_path): + return {"final_score": 1.0} +''' + evaluator_path = self._create_evaluator_file("valid_cascade.py", evaluator_content) + + # Configure for cascade evaluation + self.config.evaluator.cascade_evaluation = True + self.config.evaluator.evaluation_file = evaluator_path + + # Should not raise warnings for valid cascade evaluator + with patch('openevolve.evaluator.logger') as mock_logger: + evaluator = Evaluator(self.config.evaluator, None) + + # Should not have called warning + mock_logger.warning.assert_not_called() + + def test_cascade_validation_warning_for_missing_functions(self): + """Test cascade validation warns when cascade functions are missing""" + # Create evaluator without cascade functions + evaluator_content = ''' +def evaluate(program_path): + return {"score": 0.5} +''' + evaluator_path = self._create_evaluator_file("no_cascade.py", evaluator_content) + + # Configure for cascade evaluation + self.config.evaluator.cascade_evaluation = True + self.config.evaluator.evaluation_file = evaluator_path + + # Should warn about missing cascade functions + with patch('openevolve.evaluator.logger') as mock_logger: + evaluator = Evaluator(self.config.evaluator, None) + + # Should have warned about missing stage functions + mock_logger.warning.assert_called() + warning_call = mock_logger.warning.call_args[0][0] + self.assertIn("cascade_evaluation: true", warning_call) + self.assertIn("evaluate_stage1", warning_call) + + def test_cascade_validation_partial_functions(self): + """Test cascade validation with only some cascade functions""" + # Create evaluator with only stage1 + evaluator_content = ''' +def evaluate_stage1(program_path): + return {"stage1_score": 0.5} + +def evaluate(program_path): + return {"score": 0.5} +''' + evaluator_path = self._create_evaluator_file("partial_cascade.py", evaluator_content) + + # Configure for cascade evaluation + self.config.evaluator.cascade_evaluation = True + self.config.evaluator.evaluation_file = evaluator_path + + # Should not 
warn since stage1 exists (minimum requirement) + with patch('openevolve.evaluator.logger') as mock_logger: + evaluator = Evaluator(self.config.evaluator, None) + + # Should not warn since stage1 exists + mock_logger.warning.assert_not_called() + + def test_no_cascade_validation_when_disabled(self): + """Test no validation when cascade evaluation is disabled""" + # Create evaluator without cascade functions + evaluator_content = ''' +def evaluate(program_path): + return {"score": 0.5} +''' + evaluator_path = self._create_evaluator_file("no_cascade.py", evaluator_content) + + # Configure WITHOUT cascade evaluation + self.config.evaluator.cascade_evaluation = False + self.config.evaluator.evaluation_file = evaluator_path + + # Should not perform validation or warn + with patch('openevolve.evaluator.logger') as mock_logger: + evaluator = Evaluator(self.config.evaluator, None) + + # Should not warn when cascade evaluation is disabled + mock_logger.warning.assert_not_called() + + def test_direct_evaluate_supports_evaluation_result(self): + """Test that _direct_evaluate supports EvaluationResult returns""" + # Create evaluator that returns EvaluationResult + evaluator_content = ''' +from openevolve.database import EvaluationResult + +def evaluate(program_path): + return EvaluationResult( + metrics={"score": 0.8, "accuracy": 0.9}, + artifacts={"debug_info": "test data"} + ) +''' + evaluator_path = self._create_evaluator_file("result_evaluator.py", evaluator_content) + + self.config.evaluator.cascade_evaluation = False + self.config.evaluator.evaluation_file = evaluator_path + self.config.evaluator.timeout = 10 + + evaluator = Evaluator(self.config.evaluator, None) + + # Create a dummy program file + program_path = self._create_evaluator_file("test_program.py", "def test(): pass") + + # Mock the evaluation process + with patch('openevolve.evaluator.run_external_evaluator') as mock_run: + mock_run.return_value = EvaluationResult( + metrics={"score": 0.8, "accuracy": 0.9}, + artifacts={"debug_info": "test data"} + ) + + # Should handle EvaluationResult without issues + result = evaluator._direct_evaluate(program_path) + + # Should return the EvaluationResult as-is + self.assertIsInstance(result, EvaluationResult) + self.assertEqual(result.metrics["score"], 0.8) + self.assertEqual(result.artifacts["debug_info"], "test data") + + def test_direct_evaluate_supports_dict_result(self): + """Test that _direct_evaluate still supports dict returns""" + # Create evaluator that returns dict + evaluator_content = ''' +def evaluate(program_path): + return {"score": 0.7, "performance": 0.85} +''' + evaluator_path = self._create_evaluator_file("dict_evaluator.py", evaluator_content) + + self.config.evaluator.cascade_evaluation = False + self.config.evaluator.evaluation_file = evaluator_path + self.config.evaluator.timeout = 10 + + evaluator = Evaluator(self.config.evaluator, None) + + # Create a dummy program file + program_path = self._create_evaluator_file("test_program.py", "def test(): pass") + + # Mock the evaluation process + with patch('openevolve.evaluator.run_external_evaluator') as mock_run: + mock_run.return_value = {"score": 0.7, "performance": 0.85} + + # Should handle dict result without issues + result = evaluator._direct_evaluate(program_path) + + # Should return the dict as-is + self.assertIsInstance(result, dict) + self.assertEqual(result["score"], 0.7) + self.assertEqual(result["performance"], 0.85) + + def test_cascade_validation_with_class_based_evaluator(self): + """Test cascade validation 
with class-based evaluator""" + # Create class-based evaluator + evaluator_content = ''' +class Evaluator: + def evaluate_stage1(self, program_path): + return {"stage1_score": 0.5} + + def evaluate(self, program_path): + return {"score": 0.5} + +# Module-level functions (what validation looks for) +def evaluate_stage1(program_path): + evaluator = Evaluator() + return evaluator.evaluate_stage1(program_path) + +def evaluate(program_path): + evaluator = Evaluator() + return evaluator.evaluate(program_path) +''' + evaluator_path = self._create_evaluator_file("class_cascade.py", evaluator_content) + + # Configure for cascade evaluation + self.config.evaluator.cascade_evaluation = True + self.config.evaluator.evaluation_file = evaluator_path + + # Should not warn since module-level functions exist + with patch('openevolve.evaluator.logger') as mock_logger: + evaluator = Evaluator(self.config.evaluator, None) + + mock_logger.warning.assert_not_called() + + def test_cascade_validation_with_syntax_error(self): + """Test cascade validation handles syntax errors gracefully""" + # Create evaluator with syntax error + evaluator_content = ''' +def evaluate_stage1(program_path) # Missing colon + return {"stage1_score": 0.5} +''' + evaluator_path = self._create_evaluator_file("syntax_error.py", evaluator_content) + + # Configure for cascade evaluation + self.config.evaluator.cascade_evaluation = True + self.config.evaluator.evaluation_file = evaluator_path + + # Should handle syntax error and still warn about cascade + with patch('openevolve.evaluator.logger') as mock_logger: + evaluator = Evaluator(self.config.evaluator, None) + + # Should have warned about missing functions (due to import failure) + mock_logger.warning.assert_called() + + def test_cascade_validation_nonexistent_file(self): + """Test cascade validation with nonexistent evaluator file""" + # Configure with nonexistent file + self.config.evaluator.cascade_evaluation = True + self.config.evaluator.evaluation_file = "/nonexistent/path.py" + + # Should handle missing file gracefully + with patch('openevolve.evaluator.logger') as mock_logger: + evaluator = Evaluator(self.config.evaluator, None) + + # Should have warned about missing functions (due to import failure) + mock_logger.warning.assert_called() + + def test_process_evaluation_result_with_artifacts(self): + """Test that _process_evaluation_result handles artifacts correctly""" + evaluator_path = self._create_evaluator_file("dummy.py", "def evaluate(p): pass") + + self.config.evaluator.evaluation_file = evaluator_path + evaluator = Evaluator(self.config.evaluator, None) + + # Test with EvaluationResult containing artifacts + eval_result = EvaluationResult( + metrics={"score": 0.9}, + artifacts={"log": "test log", "data": [1, 2, 3]} + ) + + metrics, artifacts = evaluator._process_evaluation_result(eval_result) + + self.assertEqual(metrics, {"score": 0.9}) + self.assertEqual(artifacts, {"log": "test log", "data": [1, 2, 3]}) + + def test_process_evaluation_result_with_dict(self): + """Test that _process_evaluation_result handles dict results correctly""" + evaluator_path = self._create_evaluator_file("dummy.py", "def evaluate(p): pass") + + self.config.evaluator.evaluation_file = evaluator_path + evaluator = Evaluator(self.config.evaluator, None) + + # Test with dict result + dict_result = {"score": 0.7, "accuracy": 0.8} + + metrics, artifacts = evaluator._process_evaluation_result(dict_result) + + self.assertEqual(metrics, {"score": 0.7, "accuracy": 0.8}) + self.assertEqual(artifacts, 
{}) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/tests/test_database.py b/tests/test_database.py index bfa35040c..883538eb3 100644 --- a/tests/test_database.py +++ b/tests/test_database.py @@ -80,6 +80,194 @@ def test_sample(self): self.assertIsNotNone(parent) self.assertIn(parent.id, ["test1", "test2"]) + def test_island_operations_basic(self): + """Test basic island operations""" + # Test with default islands (should be 5 by default) + self.assertEqual(len(self.db.islands), 5) + + program = Program( + id="island_test", + code="def island_test(): pass", + language="python", + metrics={"score": 0.6}, + ) + + self.db.add(program) + + # Should be in island 0 + self.assertIn("island_test", self.db.islands[0]) + self.assertEqual(program.metadata.get("island"), 0) + + def test_multi_island_setup(self): + """Test database with multiple islands""" + # Create new database with multiple islands + config = Config() + config.database.in_memory = True + config.database.num_islands = 3 + multi_db = ProgramDatabase(config.database) + + self.assertEqual(len(multi_db.islands), 3) + self.assertEqual(len(multi_db.island_best_programs), 3) + + # Add programs to specific islands + for i in range(3): + program = Program( + id=f"test_island_{i}", + code=f"def test_{i}(): pass", + language="python", + metrics={"score": 0.5 + i * 0.1}, + ) + multi_db.add(program, target_island=i) + + # Verify assignment + self.assertIn(f"test_island_{i}", multi_db.islands[i]) + self.assertEqual(program.metadata.get("island"), i) + + def test_feature_coordinates_calculation(self): + """Test MAP-Elites feature coordinate calculation""" + program = Program( + id="feature_test", + code="def test(): pass", # Short code + language="python", + metrics={"score": 0.8}, + ) + + coords = self.db._calculate_feature_coords(program) + + # Should return list of coordinates + self.assertIsInstance(coords, list) + self.assertEqual(len(coords), len(self.db.config.feature_dimensions)) + + # All coordinates should be within valid range + for coord in coords: + self.assertGreaterEqual(coord, 0) + self.assertLess(coord, self.db.feature_bins) + + def test_feature_map_operations(self): + """Test feature map operations for MAP-Elites""" + program1 = Program( + id="map_test1", + code="def short(): pass", # Similar complexity + language="python", + metrics={"score": 0.5}, + ) + + program2 = Program( + id="map_test2", + code="def also_short(): pass", # Similar complexity + language="python", + metrics={"score": 0.8}, # Better score + ) + + self.db.add(program1) + self.db.add(program2) + + # Both programs might land in same cell due to similar features + # The better program should be kept in the feature map + feature_coords1 = self.db._calculate_feature_coords(program1) + feature_coords2 = self.db._calculate_feature_coords(program2) + + key1 = self.db._feature_coords_to_key(feature_coords1) + key2 = self.db._feature_coords_to_key(feature_coords2) + + if key1 == key2: # Same cell + # Better program should be in feature map + self.assertEqual(self.db.feature_map[key1], "map_test2") + else: # Different cells + # Both should be in feature map + self.assertEqual(self.db.feature_map[key1], "map_test1") + self.assertEqual(self.db.feature_map[key2], "map_test2") + + def test_get_top_programs_with_metrics(self): + """Test get_top_programs with specific metrics""" + program1 = Program( + id="metric_test1", + code="def test1(): pass", + language="python", + metrics={"accuracy": 0.9, "speed": 0.3}, + ) + + program2 = 
Program( + id="metric_test2", + code="def test2(): pass", + language="python", + metrics={"accuracy": 0.7, "speed": 0.8}, + ) + + self.db.add(program1) + self.db.add(program2) + + # Test sorting by specific metric + top_by_accuracy = self.db.get_top_programs(n=2, metric="accuracy") + self.assertEqual(top_by_accuracy[0].id, "metric_test1") # Higher accuracy + + top_by_speed = self.db.get_top_programs(n=2, metric="speed") + self.assertEqual(top_by_speed[0].id, "metric_test2") # Higher speed + + def test_archive_operations(self): + """Test archive functionality""" + # Add programs that should go into archive + for i in range(5): + program = Program( + id=f"archive_test_{i}", + code=f"def test_{i}(): return {i}", + language="python", + metrics={"score": i * 0.1}, + ) + self.db.add(program) + + # Archive should contain program IDs + self.assertGreater(len(self.db.archive), 0) + self.assertLessEqual(len(self.db.archive), self.db.config.archive_size) + + # Archive should contain program IDs that exist + for program_id in self.db.archive: + self.assertIn(program_id, self.db.programs) + + def test_best_program_tracking(self): + """Test absolute best program tracking""" + program1 = Program( + id="best_test1", + code="def test1(): pass", + language="python", + metrics={"combined_score": 0.6}, + ) + + program2 = Program( + id="best_test2", + code="def test2(): pass", + language="python", + metrics={"combined_score": 0.9}, + ) + + self.db.add(program1) + self.assertEqual(self.db.best_program_id, "best_test1") + + self.db.add(program2) + self.assertEqual(self.db.best_program_id, "best_test2") # Should update to better program + + def test_population_limit_enforcement(self): + """Test population size limit enforcement""" + # Set small population limit + original_limit = self.db.config.population_size + self.db.config.population_size = 3 + + # Add more programs than limit + for i in range(5): + program = Program( + id=f"limit_test_{i}", + code=f"def test_{i}(): pass", + language="python", + metrics={"score": i * 0.1}, + ) + self.db.add(program) + + # Population should be at or below limit + self.assertLessEqual(len(self.db.programs), 3) + + # Restore original limit + self.db.config.population_size = original_limit + if __name__ == "__main__": unittest.main() diff --git a/tests/test_island_migration.py b/tests/test_island_migration.py new file mode 100644 index 000000000..efde4e37b --- /dev/null +++ b/tests/test_island_migration.py @@ -0,0 +1,252 @@ +""" +Tests for island migration functionality in openevolve.database +""" + +import unittest +from openevolve.config import Config +from openevolve.database import Program, ProgramDatabase + + +class TestIslandMigration(unittest.TestCase): + """Tests for island migration in program database""" + + def setUp(self): + """Set up test database with multiple islands""" + config = Config() + config.database.in_memory = True + config.database.num_islands = 3 + config.database.migration_rate = 0.5 # 50% of programs migrate + config.database.migration_generations = 5 # Migrate every 5 generations + self.db = ProgramDatabase(config.database) + + def _create_test_program(self, program_id: str, score: float, island: int) -> Program: + """Helper to create a test program""" + program = Program( + id=program_id, + code=f"def func_{program_id}(): return {score}", + language="python", + metrics={"score": score, "combined_score": score}, + metadata={"island": island} + ) + return program + + def test_initial_island_setup(self): + """Test that islands are properly 
initialized""" + self.assertEqual(len(self.db.islands), 3) + self.assertEqual(len(self.db.island_best_programs), 3) + self.assertEqual(len(self.db.island_generations), 3) + + # All islands should be empty initially + for island in self.db.islands: + self.assertEqual(len(island), 0) + + # All island best programs should be None initially + for best_id in self.db.island_best_programs: + self.assertIsNone(best_id) + + def test_program_island_assignment(self): + """Test that programs are assigned to correct islands""" + # Add programs to specific islands + program1 = self._create_test_program("test1", 0.5, 0) + program2 = self._create_test_program("test2", 0.7, 1) + program3 = self._create_test_program("test3", 0.3, 2) + + self.db.add(program1, target_island=0) + self.db.add(program2, target_island=1) + self.db.add(program3, target_island=2) + + # Verify island assignments + self.assertIn("test1", self.db.islands[0]) + self.assertIn("test2", self.db.islands[1]) + self.assertIn("test3", self.db.islands[2]) + + # Verify metadata + self.assertEqual(self.db.programs["test1"].metadata["island"], 0) + self.assertEqual(self.db.programs["test2"].metadata["island"], 1) + self.assertEqual(self.db.programs["test3"].metadata["island"], 2) + + def test_should_migrate_logic(self): + """Test the migration timing logic""" + # Initially should not migrate (no generations passed) + self.assertFalse(self.db.should_migrate()) + + # Advance island generations + self.db.island_generations = [5, 6, 7] # All above threshold + self.assertTrue(self.db.should_migrate()) + + # Test with mixed generations + self.db.island_generations = [3, 6, 2] # Only one above threshold + self.assertFalse(self.db.should_migrate()) + + def test_migration_ring_topology(self): + """Test that migration follows ring topology""" + # Add programs to islands 0 and 1 + program1 = self._create_test_program("test1", 0.8, 0) + program2 = self._create_test_program("test2", 0.6, 1) + + self.db.add(program1, target_island=0) + self.db.add(program2, target_island=1) + + # Set up for migration + self.db.island_generations = [6, 6, 6] # Trigger migration + + initial_program_count = len(self.db.programs) + + # Perform migration + self.db.migrate_programs() + + # Should have created migrant copies + self.assertGreater(len(self.db.programs), initial_program_count) + + # Check that migrants were created with proper naming + migrant_ids = [pid for pid in self.db.programs.keys() if "_migrant_" in pid] + self.assertGreater(len(migrant_ids), 0) + + # Verify ring topology: island 0 -> islands 1,2; island 1 -> islands 2,0 + island_0_migrants = [pid for pid in migrant_ids if "test1_migrant_" in pid] + island_1_migrants = [pid for pid in migrant_ids if "test2_migrant_" in pid] + + # test1 should migrate to islands 1 and 2 + self.assertTrue(any("_1" in pid for pid in island_0_migrants)) + self.assertTrue(any("_2" in pid for pid in island_0_migrants)) + + # test2 should migrate to islands 2 and 0 + self.assertTrue(any("_2" in pid for pid in island_1_migrants)) + self.assertTrue(any("_0" in pid for pid in island_1_migrants)) + + def test_migration_rate_respected(self): + """Test that migration rate is properly applied""" + # Add multiple programs to island 0 + programs = [] + for i in range(10): + program = self._create_test_program(f"test{i}", 0.5 + i * 0.05, 0) + programs.append(program) + self.db.add(program, target_island=0) + + # Set up for migration + self.db.island_generations = [6, 6, 6] + + initial_count = len(self.db.programs) + + # Perform migration + 
self.db.migrate_programs() + + # Calculate expected migrants + # With 50% migration rate and 10 programs, expect 5 migrants + # Each migrant goes to 2 target islands, so 10 total new programs + expected_new_programs = 5 * 2 # 5 migrants * 2 target islands each + actual_new_programs = len(self.db.programs) - initial_count + + self.assertEqual(actual_new_programs, expected_new_programs) + + def test_migration_preserves_best_programs(self): + """Test that migration selects the best programs for migration""" + # Add programs with different scores to island 0 + program1 = self._create_test_program("low_score", 0.2, 0) + program2 = self._create_test_program("high_score", 0.9, 0) + program3 = self._create_test_program("med_score", 0.5, 0) + + self.db.add(program1, target_island=0) + self.db.add(program2, target_island=0) + self.db.add(program3, target_island=0) + + # Set up for migration + self.db.island_generations = [6, 6, 6] + + # Perform migration + self.db.migrate_programs() + + # Check that the high-score program was selected for migration + migrant_ids = [pid for pid in self.db.programs.keys() if "_migrant_" in pid] + high_score_migrants = [pid for pid in migrant_ids if "high_score_migrant_" in pid] + + self.assertGreater(len(high_score_migrants), 0) + + def test_migration_updates_generations(self): + """Test that migration updates the last migration generation""" + # Add a program and set up for migration + program = self._create_test_program("test1", 0.5, 0) + self.db.add(program, target_island=0) + + self.db.island_generations = [6, 7, 8] + initial_migration_gen = self.db.last_migration_generation + + # Perform migration + self.db.migrate_programs() + + # Should update to max of island generations + self.assertEqual(self.db.last_migration_generation, 8) + self.assertGreater(self.db.last_migration_generation, initial_migration_gen) + + def test_migration_with_empty_islands(self): + """Test that migration handles empty islands gracefully""" + # Add program only to island 0, leave others empty + program = self._create_test_program("test1", 0.5, 0) + self.db.add(program, target_island=0) + + # Set up for migration + self.db.island_generations = [6, 6, 6] + + # Should not crash with empty islands + try: + self.db.migrate_programs() + except Exception as e: + self.fail(f"Migration with empty islands should not crash: {e}") + + def test_migration_creates_proper_copies(self): + """Test that migration creates proper program copies""" + program = self._create_test_program("original", 0.7, 0) + self.db.add(program, target_island=0) + + # Set up for migration + self.db.island_generations = [6, 6, 6] + + # Perform migration + self.db.migrate_programs() + + # Find migrant copies + migrant_ids = [pid for pid in self.db.programs.keys() if "original_migrant_" in pid] + self.assertGreater(len(migrant_ids), 0) + + # Check migrant properties + for migrant_id in migrant_ids: + migrant = self.db.programs[migrant_id] + + # Should have same code and metrics + self.assertEqual(migrant.code, program.code) + self.assertEqual(migrant.metrics, program.metrics) + + # Should have proper parent reference + self.assertEqual(migrant.parent_id, "original") + + # Should be marked as migrant + self.assertTrue(migrant.metadata.get("migrant", False)) + + # Should be in correct target island + target_island = migrant.metadata["island"] + self.assertIn(migrant_id, self.db.islands[target_island]) + + def test_no_migration_with_single_island(self): + """Test that migration is skipped with single island""" + # Create database 
with single island + config = Config() + config.database.in_memory = True + config.database.num_islands = 1 + single_island_db = ProgramDatabase(config.database) + + program = self._create_test_program("test1", 0.5, 0) + single_island_db.add(program, target_island=0) + + single_island_db.island_generations = [6] + + initial_count = len(single_island_db.programs) + + # Should not perform migration + single_island_db.migrate_programs() + + # Program count should remain the same + self.assertEqual(len(single_island_db.programs), initial_count) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/tests/test_island_tracking.py b/tests/test_island_tracking.py new file mode 100644 index 000000000..28723da1f --- /dev/null +++ b/tests/test_island_tracking.py @@ -0,0 +1,266 @@ +""" +Tests for island best program tracking functionality in openevolve.database +""" + +import unittest +from openevolve.config import Config +from openevolve.database import Program, ProgramDatabase + + +class TestIslandTracking(unittest.TestCase): + """Tests for island best program tracking in program database""" + + def setUp(self): + """Set up test database with multiple islands""" + config = Config() + config.database.in_memory = True + config.database.num_islands = 3 + self.db = ProgramDatabase(config.database) + + def _create_test_program(self, program_id: str, score: float, island: int) -> Program: + """Helper to create a test program""" + program = Program( + id=program_id, + code=f"def func_{program_id}(): return {score}", + language="python", + metrics={"score": score, "combined_score": score}, + metadata={"island": island} + ) + return program + + def test_initial_island_best_tracking(self): + """Test initial state of island best program tracking""" + # Initially all island best programs should be None + self.assertEqual(len(self.db.island_best_programs), 3) + for best_id in self.db.island_best_programs: + self.assertIsNone(best_id) + + def test_first_program_becomes_island_best(self): + """Test that the first program added to an island becomes the best""" + program = self._create_test_program("first", 0.5, 0) + self.db.add(program, target_island=0) + + # Should become the best program for island 0 + self.assertEqual(self.db.island_best_programs[0], "first") + + # Other islands should still have None + self.assertIsNone(self.db.island_best_programs[1]) + self.assertIsNone(self.db.island_best_programs[2]) + + def test_better_program_updates_island_best(self): + """Test that a better program replaces the island best""" + # Add initial program + program1 = self._create_test_program("mediocre", 0.5, 0) + self.db.add(program1, target_island=0) + self.assertEqual(self.db.island_best_programs[0], "mediocre") + + # Add better program + program2 = self._create_test_program("better", 0.8, 0) + self.db.add(program2, target_island=0) + self.assertEqual(self.db.island_best_programs[0], "better") + + def test_worse_program_does_not_update_island_best(self): + """Test that a worse program does not replace the island best""" + # Add good program + program1 = self._create_test_program("good", 0.8, 0) + self.db.add(program1, target_island=0) + self.assertEqual(self.db.island_best_programs[0], "good") + + # Add worse program + program2 = self._create_test_program("worse", 0.3, 0) + self.db.add(program2, target_island=0) + + # Should still be the good program + self.assertEqual(self.db.island_best_programs[0], "good") + + def test_island_isolation_in_best_tracking(self): + """Test that island 
best tracking is isolated between islands""" + # Add programs to different islands + program1 = self._create_test_program("island0_best", 0.9, 0) + program2 = self._create_test_program("island1_best", 0.7, 1) + program3 = self._create_test_program("island2_best", 0.5, 2) + + self.db.add(program1, target_island=0) + self.db.add(program2, target_island=1) + self.db.add(program3, target_island=2) + + # Each island should track its own best + self.assertEqual(self.db.island_best_programs[0], "island0_best") + self.assertEqual(self.db.island_best_programs[1], "island1_best") + self.assertEqual(self.db.island_best_programs[2], "island2_best") + + def test_migration_updates_island_best(self): + """Test that migration can update island best programs""" + # Add program to island 0 + original = self._create_test_program("original", 0.6, 0) + self.db.add(original, target_island=0) + + # Island 1 starts empty + self.assertIsNone(self.db.island_best_programs[1]) + + # Manually create a migrant to island 1 (simulating migration) + migrant = Program( + id="original_migrant_1", + code=original.code, + language=original.language, + parent_id=original.id, + generation=original.generation, + metrics=original.metrics.copy(), + metadata={"island": 1, "migrant": True} + ) + + # Add migrant to island 1 + self.db.add(migrant, target_island=1) + + # Should become best for island 1 + self.assertEqual(self.db.island_best_programs[1], "original_migrant_1") + + def test_get_top_programs_island_specific(self): + """Test getting top programs from a specific island""" + # Add programs to island 0 + program1 = self._create_test_program("prog1", 0.9, 0) + program2 = self._create_test_program("prog2", 0.7, 0) + program3 = self._create_test_program("prog3", 0.5, 0) + + # Add programs to island 1 + program4 = self._create_test_program("prog4", 0.8, 1) + program5 = self._create_test_program("prog5", 0.6, 1) + + self.db.add(program1, target_island=0) + self.db.add(program2, target_island=0) + self.db.add(program3, target_island=0) + self.db.add(program4, target_island=1) + self.db.add(program5, target_island=1) + + # Get top programs from island 0 + island0_top = self.db.get_top_programs(n=2, island_idx=0) + self.assertEqual(len(island0_top), 2) + self.assertEqual(island0_top[0].id, "prog1") # Highest score + self.assertEqual(island0_top[1].id, "prog2") # Second highest + + # Get top programs from island 1 + island1_top = self.db.get_top_programs(n=2, island_idx=1) + self.assertEqual(len(island1_top), 2) + self.assertEqual(island1_top[0].id, "prog4") # Highest score in island 1 + self.assertEqual(island1_top[1].id, "prog5") # Second highest in island 1 + + def test_island_best_with_combined_score(self): + """Test island best tracking with combined_score metric""" + # Add programs with combined_score + program1 = Program( + id="test1", + code="def test1(): pass", + language="python", + metrics={"score": 0.5, "other": 0.3, "combined_score": 0.4}, + metadata={"island": 0} + ) + + program2 = Program( + id="test2", + code="def test2(): pass", + language="python", + metrics={"score": 0.3, "other": 0.7, "combined_score": 0.5}, + metadata={"island": 0} + ) + + self.db.add(program1, target_island=0) + self.assertEqual(self.db.island_best_programs[0], "test1") + + # program2 has higher combined_score, should become best + self.db.add(program2, target_island=0) + self.assertEqual(self.db.island_best_programs[0], "test2") + + def test_island_best_with_missing_program(self): + """Test island best tracking when best program is removed""" + 
program = self._create_test_program("to_remove", 0.8, 0) + self.db.add(program, target_island=0) + self.assertEqual(self.db.island_best_programs[0], "to_remove") + + # Manually remove the program (simulating cleanup) + del self.db.programs["to_remove"] + self.db.islands[0].remove("to_remove") + + # Add a new program - should detect stale reference and update + new_program = self._create_test_program("new", 0.6, 0) + self.db.add(new_program, target_island=0) + + # Should update the best program (the old one is gone) + self.assertEqual(self.db.island_best_programs[0], "new") + + def test_sample_inspirations_from_island(self): + """Test that inspiration sampling respects island boundaries""" + # Add programs to island 0 + program1 = self._create_test_program("island0_prog1", 0.9, 0) + program2 = self._create_test_program("island0_prog2", 0.7, 0) + + # Add programs to island 1 + program3 = self._create_test_program("island1_prog1", 0.8, 1) + program4 = self._create_test_program("island1_prog2", 0.6, 1) + + self.db.add(program1, target_island=0) + self.db.add(program2, target_island=0) + self.db.add(program3, target_island=1) + self.db.add(program4, target_island=1) + + # Sample from island 0 program + inspirations = self.db._sample_inspirations(program1, n=5) + + # All inspirations should be from island 0 + for inspiration in inspirations: + island = inspiration.metadata.get("island") + self.assertEqual(island, 0, f"Program {inspiration.id} should be from island 0, got {island}") + + def test_island_status_logging(self): + """Test island status logging functionality""" + # Add programs to different islands + program1 = self._create_test_program("p1", 0.9, 0) + program2 = self._create_test_program("p2", 0.7, 1) + + self.db.add(program1, target_island=0) + self.db.add(program2, target_island=1) + + # Should not crash when logging status + try: + self.db.log_island_status() + except Exception as e: + self.fail(f"Island status logging should not crash: {e}") + + def test_island_best_persistence(self): + """Test that island best programs are maintained across operations""" + # Add programs to islands + program1 = self._create_test_program("best0", 0.9, 0) + program2 = self._create_test_program("best1", 0.8, 1) + + self.db.add(program1, target_island=0) + self.db.add(program2, target_island=1) + + # Verify initial state + self.assertEqual(self.db.island_best_programs[0], "best0") + self.assertEqual(self.db.island_best_programs[1], "best1") + + # Add more programs that are not better + program3 = self._create_test_program("worse0", 0.5, 0) + program4 = self._create_test_program("worse1", 0.4, 1) + + self.db.add(program3, target_island=0) + self.db.add(program4, target_island=1) + + # Best should remain unchanged + self.assertEqual(self.db.island_best_programs[0], "best0") + self.assertEqual(self.db.island_best_programs[1], "best1") + + def test_invalid_island_index_handling(self): + """Test handling of invalid island indices""" + # Test with island index out of bounds + with self.assertRaises(IndexError): + self.db.get_top_programs(n=5, island_idx=10) + + def test_empty_island_top_programs(self): + """Test getting top programs from empty island""" + # Island 0 is empty initially + top_programs = self.db.get_top_programs(n=5, island_idx=0) + self.assertEqual(len(top_programs), 0) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file