@@ -89,10 +89,42 @@ def _load_evaluation_function(self) -> None:
8989
9090 self .evaluate_function = module .evaluate
9191 logger .info (f"Successfully loaded evaluation function from { self .evaluation_file } " )
92+
93+ # Validate cascade configuration
94+ self ._validate_cascade_configuration (module )
9295 except Exception as e :
9396 logger .error (f"Error loading evaluation function: { str (e )} " )
9497 raise
9598
def _validate_cascade_configuration(self, module) -> None:
    """
    Validate cascade evaluation configuration and warn about potential issues.

    This check is advisory: it inspects the loaded module with ``hasattr``
    and emits log messages, without modifying the module or the config.

    Args:
        module: The loaded evaluation module
    """
    # Nothing to validate unless cascade evaluation was requested.
    if not self.config.cascade_evaluation:
        return

    has_stage1 = hasattr(module, "evaluate_stage1")
    has_later_stage = hasattr(module, "evaluate_stage2") or hasattr(
        module, "evaluate_stage3"
    )

    if not has_stage1:
        # Cascade is enabled but the entry stage is missing entirely.
        logger.warning(
            "Configuration has 'cascade_evaluation: true' but evaluator "
            f"'{self.evaluation_file}' does not define 'evaluate_stage1' function. "
            "This will fall back to direct evaluation, making the cascade setting useless. "
            "Consider setting 'cascade_evaluation: false' or implementing cascade functions."
        )
    elif not has_later_stage:
        # Only a single stage exists, so there is nothing to cascade into.
        logger.warning(
            f"Evaluator '{self.evaluation_file}' defines 'evaluate_stage1' but no additional "
            "cascade stages (evaluate_stage2, evaluate_stage3). Consider implementing "
            "multi-stage evaluation for better cascade benefits."
        )
    else:
        logger.debug("Cascade evaluation properly configured with available stage functions")
127+
96128 async def evaluate_program (
97129 self ,
98130 program_code : str ,
@@ -273,15 +305,15 @@ def get_pending_artifacts(self, program_id: str) -> Optional[Dict[str, Union[str
273305 """
274306 return self ._pending_artifacts .pop (program_id , None )
275307
276- async def _direct_evaluate (self , program_path : str ) -> Dict [str , float ]:
308+ async def _direct_evaluate (self , program_path : str ) -> Union [ Dict [str , float ], EvaluationResult ]:
277309 """
278310 Directly evaluate a program using the evaluation function with timeout
279311
280312 Args:
281313 program_path: Path to the program file
282314
283315 Returns:
284- Dictionary of metric name to score
316+ Dictionary of metrics or EvaluationResult with metrics and artifacts
285317
286318 Raises:
287319 asyncio.TimeoutError: If evaluation exceeds timeout
@@ -296,11 +328,8 @@ async def run_evaluation():
296328 # Run the evaluation with timeout - let exceptions bubble up for retry handling
297329 result = await asyncio .wait_for (run_evaluation (), timeout = self .config .timeout )
298330
299- # Validate result
300- if not isinstance (result , dict ):
301- logger .warning (f"Evaluation returned non-dictionary result: { result } " )
302- return {"error" : 0.0 }
303-
331+ # Return result as-is to be processed by _process_evaluation_result
332+ # This supports both dict and EvaluationResult returns, just like _cascade_evaluate
304333 return result
305334
306335 async def _cascade_evaluate (
0 commit comments