@@ -69,7 +69,7 @@ def evaluate(program_path):
         return {
             "value_score": 0.0,
             "distance_score": 0.0,
-            "speed_score": 0.0,
+            "reliability_score": 0.0,
             "combined_score": 0.0,
             "error": "Missing run_search function",
         }
@@ -162,7 +162,7 @@ def evaluate(program_path):
         return {
             "value_score": 0.0,
             "distance_score": 0.0,
-            "speed_score": 0.0,
+            "reliability_score": 0.0,
             "combined_score": 0.0,
             "error": "All trials failed",
         }
@@ -173,65 +173,40 @@ def evaluate(program_path):
         avg_time = float(np.mean(times)) if times else 1.0

         # Convert to scores (higher is better)
-        value_score = float(1.0 / (1.0 + abs(avg_value - GLOBAL_MIN_VALUE)))  # Normalize and invert
+        value_score = float(1.0 / (1.0 + abs(avg_value - GLOBAL_MIN_VALUE)))
         distance_score = float(1.0 / (1.0 + avg_distance))
-        speed_score = float(1.0 / avg_time) if avg_time > 0 else 0.0
-
-        # calculate standard deviation scores
-        # get x_std_score
-        x_std_score = float(1.0 / (1.0 + np.std(x_values)))
-        # get y_std_score
-        y_std_score = float(1.0 / (1.0 + np.std(y_values)))
-        standard_deviation_score = (x_std_score + y_std_score) / 2.0
-
-        # Normalize speed score (so it doesn't dominate)
-        speed_score = float(min(speed_score, 10.0) / 10.0)
-
+
         # Add reliability score based on success rate
         reliability_score = float(success_count / num_trials)

-        # Calculate a single combined score that prioritizes finding good solutions
-        # over secondary metrics like speed and reliability
-        # Value and distance scores (quality of solution) get 90% of the weight
-        # Speed and reliability get only 10% combined
-        combined_score = float(
-            0.35 * value_score
-            + 0.35 * distance_score
-            + standard_deviation_score * 0.20
-            + 0.05 * speed_score
-            + 0.05 * reliability_score
-        )
-
-        # Also compute an "overall" score that will be the primary metric for selection
-        # This adds a bonus for finding solutions close to the global minimum
-        # and heavily penalizes solutions that aren't finding the right region
-        if distance_to_global < 1.0:  # Very close to the correct solution
-            solution_quality = 1.0
-        elif distance_to_global < 3.0:  # In the right region
-            solution_quality = 0.5
+        # Calculate solution quality based on distance to global minimum
+        if avg_distance < 0.5:  # Very close to the correct solution
+            solution_quality_multiplier = 1.5  # 50% bonus
+        elif avg_distance < 1.5:  # In the right region
+            solution_quality_multiplier = 1.2  # 20% bonus
+        elif avg_distance < 3.0:  # Getting closer
+            solution_quality_multiplier = 1.0  # No adjustment
         else:  # Not finding the right region
-            solution_quality = 0.1
+            solution_quality_multiplier = 0.7  # 30% penalty

-        # Overall score is dominated by solution quality but also factors in the combined score
-        overall_score = 0.8 * solution_quality + 0.2 * combined_score
+        # Calculate combined score that prioritizes finding the global minimum
+        # Base score from value and distance, then apply solution quality multiplier
+        base_score = 0.5 * value_score + 0.3 * distance_score + 0.2 * reliability_score
+        combined_score = float(base_score * solution_quality_multiplier)

         return {
             "value_score": value_score,
             "distance_score": distance_score,
-            "standard_deviation_score": standard_deviation_score,
-            "speed_score": speed_score,
             "reliability_score": reliability_score,
             "combined_score": combined_score,
-            "overall_score": overall_score,  # This will be the primary selection metric
-            "success_rate": reliability_score,
         }
     except Exception as e:
         print(f"Evaluation failed completely: {str(e)}")
         print(traceback.format_exc())
         return {
             "value_score": 0.0,
             "distance_score": 0.0,
-            "speed_score": 0.0,
+            "reliability_score": 0.0,
             "combined_score": 0.0,
             "error": str(e),
         }
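Note on the reworked scoring above (illustrative sketch, not part of the commit): combined_score is now a weighted base of value, distance, and reliability, scaled by the distance-tier multiplier. With assumed trial statistics of value_score = 0.9, distance_score = 0.8, reliability_score = 1.0, and avg_distance = 0.3 (the < 0.5 tier):

    base_score = 0.5 * 0.9 + 0.3 * 0.8 + 0.2 * 1.0  # = 0.89
    combined_score = base_score * 1.5               # = 1.335 after the 50% bonus

Because the top tier multiplies by 1.5, combined_score can exceed 1.0 for runs landing within 0.5 of the global minimum, so downstream selection should not assume the metric is capped at 1.0.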
@@ -255,7 +230,11 @@ def evaluate_stage1(program_path):
         # Check if the required function exists
         if not hasattr(program, "run_search"):
             print(f"Stage 1 validation: Program does not have 'run_search' function")
-            return {"runs_successfully": 0.0, "error": "Missing run_search function"}
+            return {
+                "runs_successfully": 0.0,
+                "combined_score": 0.0,
+                "error": "Missing run_search function"
+            }

         try:
             # Run a single trial with timeout
@@ -275,10 +254,18 @@ def evaluate_stage1(program_path):
                 print(
                     f"Stage 1: Invalid result format, expected tuple of 2 or 3 values but got {len(result)}"
                 )
-                return {"runs_successfully": 0.0, "error": "Invalid result format"}
+                return {
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": "Invalid result format"
+                }
             else:
                 print(f"Stage 1: Invalid result format, expected tuple but got {type(result)}")
-                return {"runs_successfully": 0.0, "error": "Invalid result format"}
+                return {
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": "Invalid result format"
+                }

             # Ensure all values are float
             x = safe_float(x)
@@ -295,7 +282,11 @@ def evaluate_stage1(program_path):
                 or np.isinf(value)
             ):
                 print(f"Stage 1 validation: Invalid result, got x={x}, y={y}, value={value}")
-                return {"runs_successfully": 0.5, "error": "Invalid result values"}
+                return {
+                    "runs_successfully": 0.5,
+                    "combined_score": 0.0,
+                    "error": "Invalid result values"
+                }

             # Calculate distance safely
             x_diff = float(x) - GLOBAL_MIN_X
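Every failure path in evaluate_stage1 now returns the same dict shape (a sketch of the pattern the hunks above and below apply; the "<reason>" string is a placeholder):

    # Explicit combined_score of 0.0 on every error return, presumably so the
    # selection loop can always read the metric without a missing-key default
    failure = {"runs_successfully": 0.0, "combined_score": 0.0, "error": "<reason>"}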
@@ -306,38 +297,59 @@ def evaluate_stage1(program_path):
             value_score = float(1.0 / (1.0 + abs(value - GLOBAL_MIN_VALUE)))
             distance_score = float(1.0 / (1.0 + distance))

-            # Calculate solution quality metric
-            if distance < 1.0:  # Very close to the correct solution
-                solution_quality = 1.0
-            elif distance < 3.0:  # In the right region
-                solution_quality = 0.5
+            # Calculate solution quality based on distance to global minimum
+            if distance < 0.5:  # Very close to the correct solution
+                solution_quality_multiplier = 1.4  # 40% bonus
+            elif distance < 1.5:  # In the right region
+                solution_quality_multiplier = 1.15  # 15% bonus
+            elif distance < 3.0:  # Getting closer
+                solution_quality_multiplier = 1.0  # No adjustment
             else:  # Not finding the right region
-                solution_quality = 0.1
+                solution_quality_multiplier = 0.8  # 20% penalty
+
+            # Calculate combined score for stage 1
+            base_score = 0.6 * value_score + 0.4 * distance_score
+            combined_score = float(base_score * solution_quality_multiplier)

-            # Basic metrics with overall score
             return {
                 "runs_successfully": 1.0,
                 "value_score": value_score,
                 "distance_score": distance_score,
-                "overall_score": solution_quality,  # This becomes a strong guiding metric
+                "combined_score": combined_score,
             }
         except TimeoutError as e:
             print(f"Stage 1 evaluation timed out: {e}")
-            return {"runs_successfully": 0.0, "error": "Timeout"}
+            return {
+                "runs_successfully": 0.0,
+                "combined_score": 0.0,
+                "error": "Timeout"
+            }
         except IndexError as e:
             # Specifically handle IndexError which often happens with early termination checks
             print(f"Stage 1 evaluation failed with IndexError: {e}")
             print("This is likely due to a list index check before the list is fully populated.")
-            return {"runs_successfully": 0.0, "error": f"IndexError: {str(e)}"}
+            return {
+                "runs_successfully": 0.0,
+                "combined_score": 0.0,
+                "error": f"IndexError: {str(e)}"
+            }
         except Exception as e:
             print(f"Stage 1 evaluation failed: {e}")
             print(traceback.format_exc())
-            return {"runs_successfully": 0.0, "error": str(e)}
+            return {
+                "runs_successfully": 0.0,
+                "combined_score": 0.0,
+                "error": str(e)
+            }

     except Exception as e:
         print(f"Stage 1 evaluation failed: {e}")
         print(traceback.format_exc())
-        return {"runs_successfully": 0.0, "error": str(e)}
+        return {
+            "runs_successfully": 0.0,
+            "combined_score": 0.0,
+            "error": str(e)
+        }


 def evaluate_stage2(program_path):
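Stage-1 scoring mirrors the full evaluation but weights only value and distance (illustrative sketch with assumed numbers, not part of the commit). For a result with value_score = 0.5 at distance 2.0, which falls in the 1.5 <= distance < 3.0 "getting closer" tier:

    distance_score = 1.0 / (1.0 + 2.0)             # ~0.333
    base_score = 0.6 * 0.5 + 0.4 * distance_score  # ~0.433
    combined_score = base_score * 1.0              # no bonus or penalty applied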