
Commit 78e153b

Commit message: f
1 parent fe7f5ec commit 78e153b

2 files changed: +198 −63 lines changed

examples/function_minimization/config.yaml (3 additions, 3 deletions)
@@ -12,8 +12,8 @@ llm:
   secondary_model_weight: 0.2
   api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
   # api_base: "https://api.cerebras.ai/v1"
-  temperature: 0.6
-  max_tokens: 10000
+  temperature: 0.7
+  max_tokens: 16000
   timeout: 120

 # Prompt configuration
@@ -31,7 +31,7 @@ database:
 # Evaluator configuration
 evaluator:
   timeout: 60
-  cascade_thresholds: [1.45]
+  cascade_thresholds: [1.4]
   parallel_evaluations: 3

 # Evolution settings
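Note: cascade_thresholds is the score bar a program must clear in stage 1 before OpenEvolve spends budget on the full evaluation, so lowering it from 1.45 to 1.4 lets slightly weaker candidates through to stage 2. A rough sketch of that gating, reusing the two stage functions from the evaluator below (a hypothetical wrapper for illustration, not the library's own dispatch code):

# Hypothetical illustration of cascade gating -- not OpenEvolve's actual code.
def cascade_evaluate(program_path, thresholds=(1.4,)):
    """Run the cheap stage-1 check first; only run the full evaluation
    if the stage-1 combined_score clears the configured threshold."""
    stage1 = evaluate_stage1(program_path)
    if stage1.metrics.get("combined_score", 0.0) < thresholds[0]:
        return stage1  # rejected early; stage 2 never runs
    return evaluate_stage2(program_path)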

examples/function_minimization/evaluator.py (195 additions, 60 deletions)
@@ -8,6 +8,7 @@
 import concurrent.futures
 import traceback
 import signal
+from openevolve.evaluation_result import EvaluationResult


 def run_with_timeout(func, args=(), kwargs={}, timeout_seconds=5):
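For context, EvaluationResult is what this whole diff migrates to: it pairs the numeric metrics OpenEvolve uses for selection with free-form artifacts that are surfaced back to the LLM as feedback on the next iteration. A minimal sketch consistent with how the calls below construct it (the real definition lives in openevolve/evaluation_result.py and may carry extra fields or helpers):

# Minimal sketch of the interface this commit relies on -- an assumption,
# not the library's verbatim definition.
from dataclasses import dataclass, field
from typing import Dict, Union

@dataclass
class EvaluationResult:
    # Numeric scores used for selection and cascade thresholds.
    metrics: Dict[str, float] = field(default_factory=dict)
    # Free-form debugging context (error types, suggestions, tracebacks).
    artifacts: Dict[str, Union[str, bytes]] = field(default_factory=dict)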
@@ -66,13 +67,23 @@ def evaluate(program_path):
         # Check if the required function exists
         if not hasattr(program, "run_search"):
             print(f"Error: program does not have 'run_search' function")
-            return {
-                "value_score": 0.0,
-                "distance_score": 0.0,
-                "reliability_score": 0.0,
-                "combined_score": 0.0,
-                "error": "Missing run_search function",
+
+            error_artifacts = {
+                "error_type": "MissingFunction",
+                "error_message": "Program is missing required 'run_search' function",
+                "suggestion": "Make sure your program includes a function named 'run_search' that returns (x, y, value) or (x, y)"
             }
+
+            return EvaluationResult(
+                metrics={
+                    "value_score": 0.0,
+                    "distance_score": 0.0,
+                    "reliability_score": 0.0,
+                    "combined_score": 0.0,
+                    "error": "Missing run_search function",
+                },
+                artifacts=error_artifacts
+            )

         # Run multiple trials
         num_trials = 10
@@ -159,13 +170,22 @@ def evaluate(program_path):

         # If all trials failed, return zero scores
         if success_count == 0:
-            return {
-                "value_score": 0.0,
-                "distance_score": 0.0,
-                "reliability_score": 0.0,
-                "combined_score": 0.0,
-                "error": "All trials failed",
+            error_artifacts = {
+                "error_type": "AllTrialsFailed",
+                "error_message": f"All {num_trials} trials failed - common issues: timeouts, crashes, or invalid return values",
+                "suggestion": "Check for infinite loops, ensure function returns (x, y) or (x, y, value), and verify algorithm terminates within time limit"
             }
+
+            return EvaluationResult(
+                metrics={
+                    "value_score": 0.0,
+                    "distance_score": 0.0,
+                    "reliability_score": 0.0,
+                    "combined_score": 0.0,
+                    "error": "All trials failed",
+                },
+                artifacts=error_artifacts
+            )

         # Calculate metrics
         avg_value = float(np.mean(values))
@@ -194,22 +214,45 @@ def evaluate(program_path):
         base_score = 0.5 * value_score + 0.3 * distance_score + 0.2 * reliability_score
         combined_score = float(base_score * solution_quality_multiplier)

-        return {
-            "value_score": value_score,
-            "distance_score": distance_score,
-            "reliability_score": reliability_score,
-            "combined_score": combined_score,
+        # Add artifacts for successful runs
+        artifacts = {
+            "convergence_info": f"Converged in {num_trials} trials with {success_count} successes",
+            "best_position": f"Final position: x={x_values[-1]:.4f}, y={y_values[-1]:.4f}" if x_values else "No successful trials",
+            "average_distance_to_global": f"{avg_distance:.4f}",
+            "search_efficiency": f"Success rate: {reliability_score:.2%}"
         }
+
+        return EvaluationResult(
+            metrics={
+                "value_score": value_score,
+                "distance_score": distance_score,
+                "reliability_score": reliability_score,
+                "combined_score": combined_score,
+            },
+            artifacts=artifacts
+        )
     except Exception as e:
         print(f"Evaluation failed completely: {str(e)}")
         print(traceback.format_exc())
-        return {
-            "value_score": 0.0,
-            "distance_score": 0.0,
-            "reliability_score": 0.0,
-            "combined_score": 0.0,
-            "error": str(e),
+
+        # Create error artifacts
+        error_artifacts = {
+            "error_type": type(e).__name__,
+            "error_message": str(e),
+            "full_traceback": traceback.format_exc(),
+            "suggestion": "Check for syntax errors or missing imports in the generated code"
         }
+
+        return EvaluationResult(
+            metrics={
+                "value_score": 0.0,
+                "distance_score": 0.0,
+                "reliability_score": 0.0,
+                "combined_score": 0.0,
+                "error": str(e),
+            },
+            artifacts=error_artifacts
+        )


 # Stage-based evaluation for cascade evaluation
@@ -230,11 +273,21 @@ def evaluate_stage1(program_path):
         # Check if the required function exists
         if not hasattr(program, "run_search"):
             print(f"Stage 1 validation: Program does not have 'run_search' function")
-            return {
-                "runs_successfully": 0.0,
-                "combined_score": 0.0,
-                "error": "Missing run_search function"
+
+            error_artifacts = {
+                "error_type": "MissingFunction",
+                "error_message": "Stage 1: Program is missing required 'run_search' function",
+                "suggestion": "Make sure your program includes a function named 'run_search' that returns (x, y, value) or (x, y)"
             }
+
+            return EvaluationResult(
+                metrics={
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": "Missing run_search function"
+                },
+                artifacts=error_artifacts
+            )

         try:
             # Run a single trial with timeout
@@ -254,18 +307,38 @@ def evaluate_stage1(program_path):
                     print(
                         f"Stage 1: Invalid result format, expected tuple of 2 or 3 values but got {len(result)}"
                     )
-                    return {
-                        "runs_successfully": 0.0,
-                        "combined_score": 0.0,
-                        "error": "Invalid result format"
+
+                    error_artifacts = {
+                        "error_type": "InvalidReturnFormat",
+                        "error_message": f"Stage 1: Function returned tuple with {len(result)} values, expected 2 or 3",
+                        "suggestion": "run_search() must return (x, y) or (x, y, value) - check your return statement"
                     }
+
+                    return EvaluationResult(
+                        metrics={
+                            "runs_successfully": 0.0,
+                            "combined_score": 0.0,
+                            "error": "Invalid result format"
+                        },
+                        artifacts=error_artifacts
+                    )
             else:
                 print(f"Stage 1: Invalid result format, expected tuple but got {type(result)}")
-                return {
-                    "runs_successfully": 0.0,
-                    "combined_score": 0.0,
-                    "error": "Invalid result format"
+
+                error_artifacts = {
+                    "error_type": "InvalidReturnType",
+                    "error_message": f"Stage 1: Function returned {type(result)}, expected tuple",
+                    "suggestion": "run_search() must return a tuple like (x, y) or (x, y, value), not a single value or other type"
                 }
+
+                return EvaluationResult(
+                    metrics={
+                        "runs_successfully": 0.0,
+                        "combined_score": 0.0,
+                        "error": "Invalid result format"
+                    },
+                    artifacts=error_artifacts
+                )

             # Ensure all values are float
             x = safe_float(x)
@@ -282,11 +355,21 @@ def evaluate_stage1(program_path):
                 or np.isinf(value)
             ):
                 print(f"Stage 1 validation: Invalid result, got x={x}, y={y}, value={value}")
-                return {
-                    "runs_successfully": 0.5,
-                    "combined_score": 0.0,
-                    "error": "Invalid result values"
+
+                error_artifacts = {
+                    "error_type": "InvalidResultValues",
+                    "error_message": f"Stage 1: Got invalid values - x={x}, y={y}, value={value}",
+                    "suggestion": "Function returned NaN or infinite values. Check for division by zero, invalid math operations, or uninitialized variables"
                 }
+
+                return EvaluationResult(
+                    metrics={
+                        "runs_successfully": 0.5,
+                        "combined_score": 0.0,
+                        "error": "Invalid result values"
+                    },
+                    artifacts=error_artifacts
+                )

             # Calculate distance safely
             x_diff = float(x) - GLOBAL_MIN_X
@@ -311,45 +394,97 @@ def evaluate_stage1(program_path):
             base_score = 0.6 * value_score + 0.4 * distance_score
             combined_score = float(base_score * solution_quality_multiplier)

-            return {
-                "runs_successfully": 1.0,
-                "value_score": value_score,
-                "distance_score": distance_score,
-                "combined_score": combined_score,
+            # Add artifacts for successful stage 1
+            stage1_artifacts = {
+                "stage1_result": f"Found solution at x={x:.4f}, y={y:.4f} with value={value:.4f}",
+                "distance_to_global": f"{distance:.4f}",
+                "solution_quality": f"Distance < 0.5: Very close" if distance < 0.5 else f"Distance < 1.5: Good region" if distance < 1.5 else "Could be improved"
             }
+
+            return EvaluationResult(
+                metrics={
+                    "runs_successfully": 1.0,
+                    "value_score": value_score,
+                    "distance_score": distance_score,
+                    "combined_score": combined_score,
+                },
+                artifacts=stage1_artifacts
+            )
         except TimeoutError as e:
             print(f"Stage 1 evaluation timed out: {e}")
-            return {
-                "runs_successfully": 0.0,
-                "combined_score": 0.0,
-                "error": "Timeout"
+
+            error_artifacts = {
+                "error_type": "TimeoutError",
+                "error_message": "Stage 1: Function execution exceeded 5 second timeout",
+                "suggestion": "Function is likely stuck in infinite loop or doing too much computation. Try reducing iterations or adding early termination conditions"
             }
+
+            return EvaluationResult(
+                metrics={
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": "Timeout"
+                },
+                artifacts=error_artifacts
+            )
         except IndexError as e:
             # Specifically handle IndexError which often happens with early termination checks
             print(f"Stage 1 evaluation failed with IndexError: {e}")
             print("This is likely due to a list index check before the list is fully populated.")
-            return {
-                "runs_successfully": 0.0,
-                "combined_score": 0.0,
-                "error": f"IndexError: {str(e)}"
+
+            error_artifacts = {
+                "error_type": "IndexError",
+                "error_message": f"Stage 1: {str(e)}",
+                "suggestion": "List index out of range - likely accessing empty list or wrong index. Check list initialization and bounds"
             }
+
+            return EvaluationResult(
+                metrics={
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": f"IndexError: {str(e)}"
+                },
+                artifacts=error_artifacts
+            )
         except Exception as e:
             print(f"Stage 1 evaluation failed: {e}")
             print(traceback.format_exc())
-            return {
-                "runs_successfully": 0.0,
-                "combined_score": 0.0,
-                "error": str(e)
+
+            error_artifacts = {
+                "error_type": type(e).__name__,
+                "error_message": f"Stage 1: {str(e)}",
+                "full_traceback": traceback.format_exc(),
+                "suggestion": "Unexpected error occurred. Check the traceback for specific issue"
            }
+
+            return EvaluationResult(
+                metrics={
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": str(e)
+                },
+                artifacts=error_artifacts
+            )

     except Exception as e:
         print(f"Stage 1 evaluation failed: {e}")
         print(traceback.format_exc())
-        return {
-            "runs_successfully": 0.0,
-            "combined_score": 0.0,
-            "error": str(e)
+
+        error_artifacts = {
+            "error_type": type(e).__name__,
+            "error_message": f"Stage 1 outer exception: {str(e)}",
+            "full_traceback": traceback.format_exc(),
+            "suggestion": "Critical error during stage 1 evaluation. Check program syntax and imports"
        }
+
+        return EvaluationResult(
+            metrics={
+                "runs_successfully": 0.0,
+                "combined_score": 0.0,
+                "error": str(e)
+            },
+            artifacts=error_artifacts
+        )


 def evaluate_stage2(program_path):