Add cascade evaluation config validation and update YAML

codelion · codelion · commit 545557ae05ec · 2025-07-11T10:53:21.000+08:00
Added validation of the cascade evaluation configuration in Evaluator to warn when cascade stage functions are missing or incomplete. Updated the rust_adaptive_sort example's config.yaml to set cascade_evaluation to false, reflecting that its evaluator does not implement cascade functions. Changed _direct_evaluate to return the evaluation result as-is so that both dict and EvaluationResult returns are supported.
diff --git a/examples/rust_adaptive_sort/config.yaml b/examples/rust_adaptive_sort/config.yaml
@@ -49,9 +49,5 @@ evaluator:
   timeout: 60  # Rust compilation can take time
   parallel_evaluations: 3
   
-  # Use cascade evaluation for performance testing
-  cascade_evaluation: true
-  cascade_thresholds:
-    - 0.5  # Compilation success and basic correctness
-    - 0.7  # Good performance
-    - 0.85 # Excellent adaptability
+  # Direct evaluation - evaluator doesn't implement cascade functions
+  cascade_evaluation: false
diff --git a/openevolve/evaluator.py b/openevolve/evaluator.py
@@ -89,10 +89,42 @@ def _load_evaluation_function(self) -> None:
 
             self.evaluate_function = module.evaluate
             logger.info(f"Successfully loaded evaluation function from {self.evaluation_file}")
+            
+            # Validate cascade configuration
+            self._validate_cascade_configuration(module)
         except Exception as e:
             logger.error(f"Error loading evaluation function: {str(e)}")
             raise
 
+    def _validate_cascade_configuration(self, module) -> None:
+        """
+        Log a warning when cascade evaluation is enabled but stage functions are missing
+        
+        Args:
+            module: The loaded evaluation module, probed for evaluate_stage1/2/3 attributes
+        """
+        if self.config.cascade_evaluation:
+            # Presence check only (hasattr); does not verify the attributes are callable
+            has_stage1 = hasattr(module, "evaluate_stage1")
+            has_stage2 = hasattr(module, "evaluate_stage2") 
+            has_stage3 = hasattr(module, "evaluate_stage3")
+            
+            if not has_stage1:
+                logger.warning(
+                    f"Configuration has 'cascade_evaluation: true' but evaluator "
+                    f"'{self.evaluation_file}' does not define 'evaluate_stage1' function. "
+                    f"This will fall back to direct evaluation, making the cascade setting useless. "
+                    f"Consider setting 'cascade_evaluation: false' or implementing cascade functions."
+                )
+            elif not (has_stage2 or has_stage3):
+                logger.warning(
+                    f"Evaluator '{self.evaluation_file}' defines 'evaluate_stage1' but no additional "
+                    f"cascade stages (evaluate_stage2, evaluate_stage3). Consider implementing "
+                    f"multi-stage evaluation for better cascade benefits."
+                )
+            else:
+                logger.debug(f"Cascade evaluation properly configured with available stage functions")
+
     async def evaluate_program(
         self,
         program_code: str,
@@ -273,15 +305,15 @@ def get_pending_artifacts(self, program_id: str) -> Optional[Dict[str, Union[str
         """
         return self._pending_artifacts.pop(program_id, None)
 
-    async def _direct_evaluate(self, program_path: str) -> Dict[str, float]:
+    async def _direct_evaluate(self, program_path: str) -> Union[Dict[str, float], EvaluationResult]:
         """
         Directly evaluate a program using the evaluation function with timeout
 
         Args:
             program_path: Path to the program file
 
         Returns:
-            Dictionary of metric name to score
+            Dictionary of metrics or EvaluationResult with metrics and artifacts
 
         Raises:
             asyncio.TimeoutError: If evaluation exceeds timeout
@@ -296,11 +328,8 @@ async def run_evaluation():
         # Run the evaluation with timeout - let exceptions bubble up for retry handling
         result = await asyncio.wait_for(run_evaluation(), timeout=self.config.timeout)
 
-        # Validate result
-        if not isinstance(result, dict):
-            logger.warning(f"Evaluation returned non-dictionary result: {result}")
-            return {"error": 0.0}
-
+        # Return result as-is to be processed by _process_evaluation_result
+        # This supports both dict and EvaluationResult returns, just like _cascade_evaluate
         return result
 
     async def _cascade_evaluate(