algorithmicsuperintelligence
diff --git a/openevolve/config.py b/openevolve/config.py
Lines changed: 18 additions & 11 deletions
diff --git a/openevolve/controller.py b/openevolve/controller.py
Lines changed: 18 additions & 5 deletions
diff --git a/openevolve/database.py b/openevolve/database.py
Lines changed: 21 additions & 14 deletions
diff --git a/openevolve/evaluation_result.py b/openevolve/evaluation_result.py
Lines changed: 2 additions & 2 deletions
diff --git a/openevolve/evaluator.py b/openevolve/evaluator.py
Lines changed: 4 additions & 2 deletions
diff --git a/openevolve/llm/openai.py b/openevolve/llm/openai.py
Lines changed: 11 additions & 7 deletions
@@ -73,8 +73,7 @@ def __post_init__(self):
         if self.primary_model:
             # Create primary model
             primary_model = LLMModelConfig(
-                name=self.primary_model,
-                weight=self.primary_model_weight or 1.0
+                name=self.primary_model, weight=self.primary_model_weight or 1.0
             )
             self.models.append(primary_model)
 
@@ -83,14 +82,22 @@ def __post_init__(self):
             if self.secondary_model_weight is None or self.secondary_model_weight > 0:
                 secondary_model = LLMModelConfig(
                     name=self.secondary_model,
-                    weight=self.secondary_model_weight if self.secondary_model_weight is not None else 0.2
+                    weight=(
+                        self.secondary_model_weight
+                        if self.secondary_model_weight is not None
+                        else 0.2
+                    ),
                 )
                 self.models.append(secondary_model)
 
         # Only validate if this looks like a user config (has some model info)
         # Don't validate during internal/default initialization
-        if (self.primary_model or self.secondary_model or 
-            self.primary_model_weight or self.secondary_model_weight) and not self.models:
+        if (
+            self.primary_model
+            or self.secondary_model
+            or self.primary_model_weight
+            or self.secondary_model_weight
+        ) and not self.models:
             raise ValueError(
                 "No LLM models configured. Please specify 'models' array or "
                 "'primary_model' in your configuration."
@@ -198,11 +205,11 @@ class DatabaseConfig:
         default_factory=lambda: ["complexity", "diversity"],
         metadata={
             "help": "List of feature dimensions for MAP-Elites grid. "
-                   "Built-in dimensions: 'complexity', 'diversity', 'score'. "
-                   "Custom dimensions: Must match metric names from evaluator. "
-                   "IMPORTANT: Evaluators must return raw continuous values for custom dimensions, "
-                   "NOT pre-computed bin indices. OpenEvolve handles all scaling and binning internally."
-        }
+            "Built-in dimensions: 'complexity', 'diversity', 'score'. "
+            "Custom dimensions: Must match metric names from evaluator. "
+            "IMPORTANT: Evaluators must return raw continuous values for custom dimensions, "
+            "NOT pre-computed bin indices. OpenEvolve handles all scaling and binning internally."
+        },
     )
     feature_bins: Union[int, Dict[str, int]] = 10  # Can be int (all dims) or dict (per-dim)
     diversity_reference_size: int = 20  # Size of reference set for diversity calculation
@@ -271,7 +278,7 @@ class Config:
     # Evolution settings
     diff_based_evolution: bool = True
     max_code_length: int = 10000
-    
+
     # Early stopping settings
     early_stopping_patience: Optional[int] = None
     convergence_threshold: float = 0.001
 
@@ -353,10 +353,20 @@ def force_exit_handler(signum, frame):
                     best_program = best_by_combined
 
         if best_program:
-            logger.info(
-                f"Evolution complete. Best program has metrics: "
-                f"{format_metrics_safe(best_program.metrics)}"
-            )
+            if (
+                hasattr(self, "parallel_controller")
+                and self.parallel_controller
+                and self.parallel_controller.early_stopping_triggered
+            ):
+                logger.info(
+                    f"🛑 Evolution complete via early stopping. Best program has metrics: "
+                    f"{format_metrics_safe(best_program.metrics)}"
+                )
+            else:
+                logger.info(
+                    f"Evolution complete. Best program has metrics: "
+                    f"{format_metrics_safe(best_program.metrics)}"
+                )
             self._save_best_program(best_program)
             return best_program
         else:
@@ -467,10 +477,13 @@ async def _run_evolution_with_checkpoints(
             start_iteration, max_iterations, target_score, checkpoint_callback=self._save_checkpoint
         )
 
-        # Check if shutdown was requested
+        # Check if shutdown or early stopping was triggered
         if self.parallel_controller.shutdown_event.is_set():
             logger.info("Evolution stopped due to shutdown request")
             return
+        elif self.parallel_controller.early_stopping_triggered:
+            logger.info("Evolution stopped due to early stopping - saving final checkpoint")
+            # Continue to save final checkpoint for early stopping
 
         # Save final checkpoint if needed
         # Note: start_iteration here is the evolution start (1 for fresh start, not 0)
 
@@ -248,7 +248,9 @@ def add(
                 if existing_program_id in self.programs:
                     existing_program = self.programs[existing_program_id]
                     new_fitness = get_fitness_score(program.metrics, self.config.feature_dimensions)
-                    existing_fitness = get_fitness_score(existing_program.metrics, self.config.feature_dimensions)
+                    existing_fitness = get_fitness_score(
+                        existing_program.metrics, self.config.feature_dimensions
+                    )
                     logger.info(
                         "MAP-Elites cell improved: %s (fitness: %.3f -> %.3f)",
                         coords_dict,
@@ -290,7 +292,7 @@ def add(
         else:
             # No parent and no target specified, use current island
             island_idx = self.current_island
-        
+
         island_idx = island_idx % len(self.islands)  # Ensure valid island
         self.islands[island_idx].add(program.id)
 
@@ -547,7 +549,7 @@ def load(self, path: str) -> None:
             self.current_island = metadata.get("current_island", 0)
             self.island_generations = metadata.get("island_generations", [0] * len(saved_islands))
             self.last_migration_generation = metadata.get("last_migration_generation", 0)
-            
+
             # Load feature_stats for MAP-Elites grid stability
             self.feature_stats = self._deserialize_feature_stats(metadata.get("feature_stats", {}))
 
@@ -839,7 +841,7 @@ def _feature_coords_to_key(self, coords: List[int]) -> str:
     def _is_better(self, program1: Program, program2: Program) -> bool:
         """
         Determine if program1 has better FITNESS than program2
-        
+
         Uses fitness calculation that excludes MAP-Elites feature dimensions
         to prevent pollution of fitness comparisons.
 
@@ -901,7 +903,8 @@ def _update_archive(self, program: Program) -> None:
         # Find worst program among valid programs
         if valid_archive_programs:
             worst_program = min(
-                valid_archive_programs, key=lambda p: get_fitness_score(p.metrics, self.config.feature_dimensions)
+                valid_archive_programs,
+                key=lambda p: get_fitness_score(p.metrics, self.config.feature_dimensions),
             )
 
             # Replace if new program is better
@@ -1848,7 +1851,7 @@ def _scale_feature_value_minmax(self, feature_name: str, value: float) -> float:
     def _serialize_feature_stats(self) -> Dict[str, Any]:
         """
         Serialize feature_stats for JSON storage
-        
+
         Returns:
             Dictionary that can be JSON-serialized
         """
@@ -1866,26 +1869,28 @@ def _serialize_feature_stats(self) -> Dict[str, Any]:
                         serialized_stats[key] = value
                 else:
                     # Convert numpy types to Python native types
-                    if hasattr(value, 'item'):  # numpy scalar
+                    if hasattr(value, "item"):  # numpy scalar
                         serialized_stats[key] = value.item()
                     else:
                         serialized_stats[key] = value
             serialized[feature_name] = serialized_stats
         return serialized
-    
-    def _deserialize_feature_stats(self, stats_dict: Dict[str, Any]) -> Dict[str, Dict[str, Union[float, List[float]]]]:
+
+    def _deserialize_feature_stats(
+        self, stats_dict: Dict[str, Any]
+    ) -> Dict[str, Dict[str, Union[float, List[float]]]]:
         """
         Deserialize feature_stats from loaded JSON
-        
+
         Args:
             stats_dict: Dictionary loaded from JSON
-            
+
         Returns:
             Properly formatted feature_stats dictionary
         """
         if not stats_dict:
             return {}
-            
+
         deserialized = {}
         for feature_name, stats in stats_dict.items():
             if isinstance(stats, dict):
@@ -1897,8 +1902,10 @@ def _deserialize_feature_stats(self, stats_dict: Dict[str, Any]) -> Dict[str, Di
                 }
                 deserialized[feature_name] = deserialized_stats
             else:
-                logger.warning(f"Skipping malformed feature_stats entry for '{feature_name}': {stats}")
-        
+                logger.warning(
+                    f"Skipping malformed feature_stats entry for '{feature_name}': {stats}"
+                )
+
         return deserialized
 
     def log_island_status(self) -> None:
 
@@ -15,8 +15,8 @@ class EvaluationResult:
     This maintains backward compatibility with the existing dict[str, float] contract
     while adding a side-channel for arbitrary artifacts (text or binary data).
 
-    IMPORTANT: For custom MAP-Elites features, metrics values must be raw continuous 
-    scores (e.g., actual counts, percentages, continuous measurements), NOT pre-computed 
+    IMPORTANT: For custom MAP-Elites features, metrics values must be raw continuous
+    scores (e.g., actual counts, percentages, continuous measurements), NOT pre-computed
     bin indices. The database handles all binning internally using min-max scaling.
 
     Examples:
 
@@ -44,7 +44,7 @@ def __init__(
         llm_ensemble: Optional[LLMEnsemble] = None,
         prompt_sampler: Optional[PromptSampler] = None,
         database: Optional[ProgramDatabase] = None,
-        suffix: Optional[str]=".py",
+        suffix: Optional[str] = ".py",
     ):
         self.config = config
         self.evaluation_file = evaluation_file
@@ -565,7 +565,9 @@ async def _llm_evaluate(self, program_code: str, program_id: str = "") -> Dict[s
             # Create prompt for LLM
             feature_dimensions = self.database.config.feature_dimensions if self.database else []
             prompt = self.prompt_sampler.build_prompt(
-                current_program=program_code, template_key="evaluation", feature_dimensions=feature_dimensions
+                current_program=program_code,
+                template_key="evaluation",
+                feature_dimensions=feature_dimensions,
             )
 
             # Get LLM response
 
@@ -70,20 +70,24 @@ async def generate_with_context(
         # These models don't support temperature/top_p and use different parameters
         OPENAI_REASONING_MODEL_PREFIXES = (
             # O-series reasoning models
-            "o1-", "o1",  # o1, o1-mini, o1-preview
-            "o3-", "o3",  # o3, o3-mini, o3-pro  
-            "o4-",        # o4-mini
+            "o1-",
+            "o1",  # o1, o1-mini, o1-preview
+            "o3-",
+            "o3",  # o3, o3-mini, o3-pro
+            "o4-",  # o4-mini
             # GPT-5 series are also reasoning models
-            "gpt-5-", "gpt-5",  # gpt-5, gpt-5-mini, gpt-5-nano
+            "gpt-5-",
+            "gpt-5",  # gpt-5, gpt-5-mini, gpt-5-nano
             # The GPT OSS series are also reasoning models
-            "gpt-oss-120b", "gpt-oss-20b"
+            "gpt-oss-120b",
+            "gpt-oss-20b",
         )
 
         # Check if this is an OpenAI reasoning model
         model_lower = str(self.model).lower()
         is_openai_reasoning_model = (
-            self.api_base == "https://api.openai.com/v1" and 
-            model_lower.startswith(OPENAI_REASONING_MODEL_PREFIXES)
+            self.api_base == "https://api.openai.com/v1"
+            and model_lower.startswith(OPENAI_REASONING_MODEL_PREFIXES)
         )
 
         if is_openai_reasoning_model: