Skip to content

Commit 545557a

Browse files
committed
Add cascade evaluation config validation and update YAML
Added validation of the cascade evaluation configuration in the Evaluator: it now logs a warning when `cascade_evaluation` is enabled but the evaluator module's cascade functions are missing or incomplete. Updated the example's config.yaml to set `cascade_evaluation` to false, reflecting that the example evaluator does not implement cascade functions. Improved `_direct_evaluate` to support both dict and EvaluationResult return values.
1 parent 1ee7110 commit 545557a

File tree

2 files changed

+38
-13
lines changed

2 files changed

+38
-13
lines changed

examples/rust_adaptive_sort/config.yaml

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,5 @@ evaluator:
4949
timeout: 60 # Rust compilation can take time
5050
parallel_evaluations: 3
5151

52-
# Use cascade evaluation for performance testing
53-
cascade_evaluation: true
54-
cascade_thresholds:
55-
- 0.5 # Compilation success and basic correctness
56-
- 0.7 # Good performance
57-
- 0.85 # Excellent adaptability
52+
# Direct evaluation - evaluator doesn't implement cascade functions
53+
cascade_evaluation: false

openevolve/evaluator.py

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -89,10 +89,42 @@ def _load_evaluation_function(self) -> None:
8989

9090
self.evaluate_function = module.evaluate
9191
logger.info(f"Successfully loaded evaluation function from {self.evaluation_file}")
92+
93+
# Validate cascade configuration
94+
self._validate_cascade_configuration(module)
9295
except Exception as e:
9396
logger.error(f"Error loading evaluation function: {str(e)}")
9497
raise
9598

99+
def _validate_cascade_configuration(self, module) -> None:
    """
    Warn when the cascade evaluation setting does not match the evaluator module

    Nothing is checked unless ``self.config.cascade_evaluation`` is truthy. When it
    is, the module is probed for ``evaluate_stage1``, ``evaluate_stage2`` and
    ``evaluate_stage3`` attributes and one message is logged describing the result:

    * no ``evaluate_stage1``: warning that the cascade setting has no effect
    * ``evaluate_stage1`` only: warning that no further stages are defined
    * ``evaluate_stage1`` plus at least one later stage: debug confirmation

    The method only logs; it does not modify the configuration or the module.

    Args:
        module: The loaded evaluation module
    """
    # Cascade evaluation is disabled, so there is nothing to validate
    if not self.config.cascade_evaluation:
        return

    # Only attribute presence is checked here, not whether the attribute is callable
    has_stage1 = hasattr(module, "evaluate_stage1")
    has_stage2 = hasattr(module, "evaluate_stage2")
    has_stage3 = hasattr(module, "evaluate_stage3")

    if not has_stage1:
        logger.warning(
            "Configuration has 'cascade_evaluation: true' but evaluator "
            f"'{self.evaluation_file}' does not define 'evaluate_stage1' function. "
            "This will fall back to direct evaluation, making the cascade setting useless. "
            "Consider setting 'cascade_evaluation: false' or implementing cascade functions."
        )
    elif not (has_stage2 or has_stage3):
        logger.warning(
            f"Evaluator '{self.evaluation_file}' defines 'evaluate_stage1' but no additional "
            "cascade stages (evaluate_stage2, evaluate_stage3). Consider implementing "
            "multi-stage evaluation for better cascade benefits."
        )
    else:
        logger.debug("Cascade evaluation properly configured with available stage functions")
127+
96128
async def evaluate_program(
97129
self,
98130
program_code: str,
@@ -273,15 +305,15 @@ def get_pending_artifacts(self, program_id: str) -> Optional[Dict[str, Union[str
273305
"""
274306
return self._pending_artifacts.pop(program_id, None)
275307

276-
async def _direct_evaluate(self, program_path: str) -> Dict[str, float]:
308+
async def _direct_evaluate(self, program_path: str) -> Union[Dict[str, float], EvaluationResult]:
277309
"""
278310
Directly evaluate a program using the evaluation function with timeout
279311
280312
Args:
281313
program_path: Path to the program file
282314
283315
Returns:
284-
Dictionary of metric name to score
316+
Dictionary of metrics or EvaluationResult with metrics and artifacts
285317
286318
Raises:
287319
asyncio.TimeoutError: If evaluation exceeds timeout
@@ -296,11 +328,8 @@ async def run_evaluation():
296328
# Run the evaluation with timeout - let exceptions bubble up for retry handling
297329
result = await asyncio.wait_for(run_evaluation(), timeout=self.config.timeout)
298330

299-
# Validate result
300-
if not isinstance(result, dict):
301-
logger.warning(f"Evaluation returned non-dictionary result: {result}")
302-
return {"error": 0.0}
303-
331+
# Return result as-is to be processed by _process_evaluation_result
332+
# This supports both dict and EvaluationResult returns, just like _cascade_evaluate
304333
return result
305334

306335
async def _cascade_evaluate(

0 commit comments

Comments
 (0)