
Commit d868073

Improve population management and reproducibility in evolution
Refactors population size enforcement in ProgramDatabase to protect newly added and best programs, and ensures tracked best program references are valid. Enhances random seed propagation for reproducibility, improves logging consistency, and applies minor code style and logic cleanups across controller, LLM ensemble, prompt sampler, and threaded parallel modules.
1 parent ce4570e

File tree: 8 files changed (+220, -179 lines)

openevolve/config.py
Lines changed: 9 additions & 7 deletions

@@ -32,7 +32,7 @@ class LLMModelConfig:
     timeout: int = None
     retries: int = None
     retry_delay: int = None
-
+
     # Reproducibility
     random_seed: Optional[int] = None

@@ -56,10 +56,12 @@ class LLMConfig(LLMModelConfig):
     retry_delay: int = 5

     # n-model configuration for evolution LLM ensemble
-    models: List[LLMModelConfig] = field(default_factory=lambda: [
-        LLMModelConfig(name="gpt-4o-mini", weight=0.8),
-        LLMModelConfig(name="gpt-4o", weight=0.2)
-    ])
+    models: List[LLMModelConfig] = field(
+        default_factory=lambda: [
+            LLMModelConfig(name="gpt-4o-mini", weight=0.8),
+            LLMModelConfig(name="gpt-4o", weight=0.2),
+        ]
+    )

     # n-model configuration for evaluator LLM ensemble
     evaluator_models: List[LLMModelConfig] = field(default_factory=lambda: [])

@@ -264,7 +266,7 @@ def from_dict(cls, config_dict: Dict[str, Any]) -> "Config":
             config.prompt = PromptConfig(**config_dict["prompt"])
         if "database" in config_dict:
             config.database = DatabaseConfig(**config_dict["database"])
-
+
             # Ensure database inherits the random seed if not explicitly set
             if config.database.random_seed is None and config.random_seed is not None:
                 config.database.random_seed = config.random_seed

@@ -365,4 +367,4 @@ def load_config(config_path: Optional[Union[str, Path]] = None) -> Config:
     # Make the system message available to the individual models, in case it is not provided from the prompt sampler
     config.llm.update_model_params({"system_message": config.prompt.system_message})

-    return config
+    return config
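
The inheritance rule in the `from_dict` hunk is worth pinning down: the database seed is filled in only when it is unset and a top-level seed exists. A minimal sketch of that precedence logic, using stand-in dataclasses rather than the real `Config` (the names here are illustrative, not the project's):

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class DatabaseCfg:
    random_seed: Optional[int] = None

@dataclass
class Cfg:
    random_seed: Optional[int] = 42
    database: DatabaseCfg = field(default_factory=DatabaseCfg)

    def resolve_seeds(self) -> None:
        # The database inherits the top-level seed only when it has none of its own
        if self.database.random_seed is None and self.random_seed is not None:
            self.database.random_seed = self.random_seed

cfg = Cfg()
cfg.resolve_seeds()
assert cfg.database.random_seed == 42  # inherited from the top level

cfg2 = Cfg(database=DatabaseCfg(random_seed=7))
cfg2.resolve_seeds()
assert cfg2.database.random_seed == 7  # an explicit database seed wins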

openevolve/controller.py
Lines changed: 24 additions & 26 deletions

@@ -104,20 +104,20 @@ def __init__(
         # Set global random seeds
         random.seed(self.config.random_seed)
         np.random.seed(self.config.random_seed)
-
+
         # Create hash-based seeds for different components
-        base_seed = str(self.config.random_seed).encode('utf-8')
-        llm_seed = int(hashlib.md5(base_seed + b'llm').hexdigest()[:8], 16) % (2**31)
-
+        base_seed = str(self.config.random_seed).encode("utf-8")
+        llm_seed = int(hashlib.md5(base_seed + b"llm").hexdigest()[:8], 16) % (2**31)
+
         # Propagate seed to LLM configurations
         self.config.llm.random_seed = llm_seed
         for model_cfg in self.config.llm.models:
-            if not hasattr(model_cfg, 'random_seed') or model_cfg.random_seed is None:
+            if not hasattr(model_cfg, "random_seed") or model_cfg.random_seed is None:
                 model_cfg.random_seed = llm_seed
         for model_cfg in self.config.llm.evaluator_models:
-            if not hasattr(model_cfg, 'random_seed') or model_cfg.random_seed is None:
+            if not hasattr(model_cfg, "random_seed") or model_cfg.random_seed is None:
                 model_cfg.random_seed = llm_seed
-
+
         logger.info(f"Set random seed to {self.config.random_seed} for reproducibility")
         logger.debug(f"Generated LLM seed: {llm_seed}")

@@ -161,7 +161,7 @@ def __init__(
         self.evaluation_file = evaluation_file

         logger.info(f"Initialized OpenEvolve with {initial_program_path}")
-
+
         # Initialize improved parallel processing components
         self.parallel_controller = None

@@ -212,7 +212,7 @@ async def run(
             Best program found
         """
         max_iterations = iterations or self.config.max_iterations
-
+
         # Determine starting iteration
         start_iteration = 0
         if checkpoint_path and os.path.exists(checkpoint_path):

@@ -260,30 +260,31 @@ async def run(
             self.parallel_controller = ImprovedParallelController(
                 self.config, self.evaluation_file, self.database
             )
-
+
             # Set up signal handlers for graceful shutdown
             def signal_handler(signum, frame):
                 logger.info(f"Received signal {signum}, initiating graceful shutdown...")
                 self.parallel_controller.request_shutdown()
-
+
                 # Set up a secondary handler for immediate exit if user presses Ctrl+C again
                 def force_exit_handler(signum, frame):
                     logger.info("Force exit requested - terminating immediately")
                     import sys
+
                     sys.exit(0)
-
+
                 signal.signal(signal.SIGINT, force_exit_handler)
-
+
             signal.signal(signal.SIGINT, signal_handler)
             signal.signal(signal.SIGTERM, signal_handler)
-
+
             self.parallel_controller.start()
-
+
             # Run evolution with improved parallel processing and checkpoint callback
             await self._run_evolution_with_checkpoints(
                 start_iteration, max_iterations, target_score
             )
-
+
         finally:
             # Clean up parallel processing resources
             if self.parallel_controller:

@@ -420,31 +421,28 @@ def _load_checkpoint(self, checkpoint_path: str) -> None:
         """Load state from a checkpoint directory"""
         if not os.path.exists(checkpoint_path):
             raise FileNotFoundError(f"Checkpoint directory {checkpoint_path} not found")
-
+
         logger.info(f"Loading checkpoint from {checkpoint_path}")
         self.database.load(checkpoint_path)
-        logger.info(
-            f"Checkpoint loaded successfully (iteration {self.database.last_iteration})"
-        )
+        logger.info(f"Checkpoint loaded successfully (iteration {self.database.last_iteration})")

     async def _run_evolution_with_checkpoints(
         self, start_iteration: int, max_iterations: int, target_score: Optional[float]
     ) -> None:
         """Run evolution with checkpoint saving support"""
         logger.info(f"Using island-based evolution with {self.config.database.num_islands} islands")
         self.database.log_island_status()
-
+
         # Run the evolution process with checkpoint callback
         await self.parallel_controller.run_evolution(
-            start_iteration, max_iterations, target_score,
-            checkpoint_callback=self._save_checkpoint
+            start_iteration, max_iterations, target_score, checkpoint_callback=self._save_checkpoint
         )
-
+
         # Check if shutdown was requested
         if self.parallel_controller.shutdown_flag.is_set():
             logger.info("Evolution stopped due to shutdown request")
             return
-
+
         # Save final checkpoint if needed
         final_iteration = start_iteration + max_iterations - 1
         if final_iteration > 0 and final_iteration % self.config.checkpoint_interval == 0:

@@ -499,4 +497,4 @@ def _save_best_program(self, program: Optional[Program] = None) -> None:
             indent=2,
         )

-        logger.info(f"Saved best program to {code_path} with program info to {info_path}")
+        logger.info(f"Saved best program to {code_path} with program info to {info_path}")

openevolve/database.py
Lines changed: 36 additions & 15 deletions

@@ -9,6 +9,7 @@
 import random
 import time
 from dataclasses import asdict, dataclass, field, fields
+
 # FileLock removed - no longer needed with threaded parallel processing
 from typing import Any, Dict, List, Optional, Set, Tuple, Union

@@ -198,7 +199,11 @@ def add(
         # Update archive
         self._update_archive(program)

-        # Update the absolute best program tracking
+        # Enforce population size limit BEFORE updating best program tracking
+        # This ensures newly added programs aren't immediately removed
+        self._enforce_population_limit(exclude_program_id=program.id)
+
+        # Update the absolute best program tracking (after population enforcement)
         self._update_best_program(program)

         # Save to disk if configured

@@ -207,9 +212,6 @@ def add(

         logger.debug(f"Added program {program.id} to island {island_idx}")

-        # Enforce population size limit
-        self._enforce_population_limit()
-
         return program.id

     def get(self, program_id: str) -> Optional[Program]:

@@ -254,9 +256,15 @@ def get_best_program(self, metric: Optional[str] = None) -> Optional[Program]:
             return None

         # If no specific metric and we have a tracked best program, return it
-        if metric is None and self.best_program_id and self.best_program_id in self.programs:
-            logger.debug(f"Using tracked best program: {self.best_program_id}")
-            return self.programs[self.best_program_id]
+        if metric is None and self.best_program_id:
+            if self.best_program_id in self.programs:
+                logger.debug(f"Using tracked best program: {self.best_program_id}")
+                return self.programs[self.best_program_id]
+            else:
+                logger.warning(
+                    f"Tracked best program {self.best_program_id} no longer exists, will recalculate"
+                )
+                self.best_program_id = None

         if metric:
             # Sort by specific metric

@@ -713,7 +721,15 @@ def _update_best_program(self, program: Program) -> None:
             logger.debug(f"Set initial best program to {program.id}")
             return

-        # Compare with current best program
+        # Compare with current best program (if it still exists)
+        if self.best_program_id not in self.programs:
+            logger.warning(
+                f"Best program {self.best_program_id} no longer exists, clearing reference"
+            )
+            self.best_program_id = program.id
+            logger.info(f"Set new best program to {program.id}")
+            return
+
         current_best = self.programs[self.best_program_id]

         # Update if the new program is better

@@ -940,9 +956,12 @@ def _sample_inspirations(self, parent: Program, n: int = 5) -> List[Program]:

         return inspirations[:n]

-    def _enforce_population_limit(self) -> None:
+    def _enforce_population_limit(self, exclude_program_id: Optional[str] = None) -> None:
         """
         Enforce the population size limit by removing worst programs if needed
+
+        Args:
+            exclude_program_id: Program ID to never remove (e.g., newly added program)
         """
         if len(self.programs) <= self.config.population_size:
             return

@@ -963,22 +982,24 @@ def _enforce_population_limit(self, exclude_program_id: Optional[str] = None) -> None:
             key=lambda p: safe_numeric_average(p.metrics),
         )

-        # Remove worst programs, but never remove the best program
+        # Remove worst programs, but never remove the best program or excluded program
         programs_to_remove = []
+        protected_ids = {self.best_program_id, exclude_program_id} - {None}
+
         for program in sorted_programs:
             if len(programs_to_remove) >= num_to_remove:
                 break
-            # Don't remove the best program
-            if program.id != self.best_program_id:
+            # Don't remove the best program or excluded program
+            if program.id not in protected_ids:
                 programs_to_remove.append(program)

-        # If we still need to remove more and only have the best program protected,
-        # remove from the remaining programs anyway (but keep the absolute best)
+        # If we still need to remove more and only have protected programs,
+        # remove from the remaining programs anyway (but keep the protected ones)
         if len(programs_to_remove) < num_to_remove:
             remaining_programs = [
                 p
                 for p in sorted_programs
-                if p not in programs_to_remove and p.id != self.best_program_id
+                if p not in programs_to_remove and p.id not in protected_ids
             ]
             additional_removals = remaining_programs[: num_to_remove - len(programs_to_remove)]
             programs_to_remove.extend(additional_removals)
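
Together, the reordering in `add` and the new `exclude_program_id` parameter establish one invariant: a trim pass can never evict the tracked best program or the program that triggered the trim. A simplified sketch of that removal policy, assuming each program reduces to a single fitness score (the real code averages metrics via `safe_numeric_average`):

from typing import Dict, Optional

def enforce_limit(
    programs: Dict[str, float],        # program id -> fitness score
    population_size: int,
    best_id: Optional[str],
    exclude_id: Optional[str] = None,  # e.g. the program just added
) -> None:
    if len(programs) <= population_size:
        return
    num_to_remove = len(programs) - population_size
    protected = {best_id, exclude_id} - {None}
    # Remove worst-first, skipping protected ids
    for pid in sorted(programs, key=programs.get):
        if num_to_remove == 0:
            break
        if pid in protected:
            continue
        del programs[pid]
        num_to_remove -= 1

pop = {"a": 0.1, "b": 0.9, "c": 0.2, "d": 0.5}
enforce_limit(pop, population_size=2, best_id="b", exclude_id="a")
assert set(pop) == {"a", "b"}  # worst unprotected programs ("c", then "d") removed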

openevolve/iteration.py
Lines changed: 9 additions & 12 deletions

@@ -31,24 +31,21 @@ class Result:
     artifacts: dict = None


-
-
-
 async def run_iteration_with_shared_db(
-    iteration: int,
-    config: Config,
+    iteration: int,
+    config: Config,
     database: ProgramDatabase,
     evaluator: Evaluator,
     llm_ensemble: LLMEnsemble,
-    prompt_sampler: PromptSampler
+    prompt_sampler: PromptSampler,
 ):
     """
     Run a single iteration using shared memory database
-
+
     This is optimized for use with persistent worker processes.
     """
     logger = logging.getLogger(__name__)
-
+
     try:
         # Sample parent and inspirations from database
         parent, inspirations = database.sample()

@@ -115,10 +112,10 @@ async def run_iteration_with_shared_db(
         # Evaluate the child program
         child_id = str(uuid.uuid4())
         result.child_metrics = await evaluator.evaluate_program(child_code, child_id)
-
+
         # Handle artifacts if they exist
         artifacts = evaluator.get_pending_artifacts(child_id)
-
+
         # Create a child program
         result.child_program = Program(
             id=child_id,

@@ -133,13 +130,13 @@ async def run_iteration_with_shared_db(
                 "parent_metrics": parent.metrics,
             },
         )
-
+
         result.prompt = prompt
         result.llm_response = llm_response
         result.artifacts = artifacts
         result.iteration_time = time.time() - iteration_start
         result.iteration = iteration
-
+
         return result

     except Exception as e:

openevolve/llm/ensemble.py
Lines changed: 10 additions & 4 deletions

@@ -27,16 +27,22 @@ def __init__(self, models_cfg: List[LLMModelConfig]):
         self.weights = [model.weight for model in models_cfg]
         total = sum(self.weights)
         self.weights = [w / total for w in self.weights]
-
+
         # Set up random state for deterministic model selection
         self.random_state = random.Random()
         # Initialize with seed from first model's config if available
-        if models_cfg and hasattr(models_cfg[0], 'random_seed') and models_cfg[0].random_seed is not None:
+        if (
+            models_cfg
+            and hasattr(models_cfg[0], "random_seed")
+            and models_cfg[0].random_seed is not None
+        ):
             self.random_state.seed(models_cfg[0].random_seed)
-            logger.debug(f"LLMEnsemble: Set random seed to {models_cfg[0].random_seed} for deterministic model selection")
+            logger.debug(
+                f"LLMEnsemble: Set random seed to {models_cfg[0].random_seed} for deterministic model selection"
+            )

         # Only log if we have multiple models or this is the first ensemble
-        if len(models_cfg) > 1 or not hasattr(logger, '_ensemble_logged'):
+        if len(models_cfg) > 1 or not hasattr(logger, "_ensemble_logged"):
             logger.info(
                 f"Initialized LLM ensemble with models: "
                 + ", ".join(

openevolve/llm/openai.py
Lines changed: 4 additions & 4 deletions

@@ -32,7 +32,7 @@ def __init__(
         self.retry_delay = model_cfg.retry_delay
         self.api_base = model_cfg.api_base
         self.api_key = model_cfg.api_key
-        self.random_seed = getattr(model_cfg, 'random_seed', None)
+        self.random_seed = getattr(model_cfg, "random_seed", None)

         # Set up API client
         self.client = openai.OpenAI(

@@ -41,9 +41,9 @@ def __init__(
         )

         # Only log unique models to reduce duplication
-        if not hasattr(logger, '_initialized_models'):
+        if not hasattr(logger, "_initialized_models"):
             logger._initialized_models = set()
-
+
         if self.model not in logger._initialized_models:
             logger.info(f"Initialized OpenAI LLM with model: {self.model}")
             logger._initialized_models.add(self.model)

@@ -80,7 +80,7 @@ async def generate_with_context(
             "top_p": kwargs.get("top_p", self.top_p),
             "max_tokens": kwargs.get("max_tokens", self.max_tokens),
         }
-
+
         # Add seed parameter for reproducibility if configured
         # Skip seed parameter for Google AI Studio endpoint as it doesn't support it
         seed = kwargs.get("seed", self.random_seed)
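
The final hunk resolves the effective seed as the call-site override first, the configured default second, and omits the parameter for endpoints that reject it. A hedged sketch of that parameter assembly as a standalone function (the Google AI Studio host check is inferred from the comment above; the hostname substring is an assumption, not taken from the diff):

from typing import Optional

def build_request_params(
    model: str,
    temperature: float,
    top_p: float,
    max_tokens: int,
    api_base: Optional[str] = None,
    configured_seed: Optional[int] = None,
    **overrides,
) -> dict:
    params = {
        "model": model,
        "temperature": overrides.get("temperature", temperature),
        "top_p": overrides.get("top_p", top_p),
        "max_tokens": overrides.get("max_tokens", max_tokens),
    }
    # Per-call seed overrides the configured default; None leaves it out entirely
    seed = overrides.get("seed", configured_seed)
    # Assumed host check: the comment says Google AI Studio doesn't accept `seed`
    if seed is not None and "generativelanguage.googleapis.com" not in (api_base or ""):
        params["seed"] = seed
    return params

p = build_request_params("gpt-4o-mini", 0.7, 0.95, 4096, configured_seed=123)
assert p["seed"] == 123  # configured seed applies when no override is given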
