@@ -89,10 +89,42 @@ def _load_evaluation_function(self) -> None:
8989
9090 self .evaluate_function = module .evaluate
9191 logger .info (f"Successfully loaded evaluation function from { self .evaluation_file } " )
92+
93+ # Validate cascade configuration
94+ self ._validate_cascade_configuration (module )
9295 except Exception as e :
9396 logger .error (f"Error loading evaluation function: { str (e )} " )
9497 raise
9598
def _validate_cascade_configuration(self, module) -> None:
    """
    Validate cascade evaluation configuration and warn about potential issues.

    When ``cascade_evaluation`` is enabled in the config, checks that the
    loaded evaluator module actually defines the staged entry points
    (``evaluate_stage1`` .. ``evaluate_stage3``). Only logs warnings /
    debug output; never raises and returns nothing.

    Args:
        module: The loaded evaluation module to inspect.
    """
    # Guard clause: nothing to validate when cascading is disabled.
    if not self.config.cascade_evaluation:
        return

    # Probe which cascade stage functions the evaluator provides.
    has_stage1 = hasattr(module, "evaluate_stage1")
    has_stage2 = hasattr(module, "evaluate_stage2")
    has_stage3 = hasattr(module, "evaluate_stage3")

    if not has_stage1:
        # Cascade requested but no stage-1 entry point: the setting is inert.
        logger.warning(
            f"Configuration has 'cascade_evaluation: true' but evaluator "
            f"'{self.evaluation_file}' does not define 'evaluate_stage1' function. "
            f"This will fall back to direct evaluation, making the cascade setting useless. "
            f"Consider setting 'cascade_evaluation: false' or implementing cascade functions."
        )
    elif not (has_stage2 or has_stage3):
        # Stage 1 alone gives no cascading benefit; suggest further stages.
        logger.warning(
            f"Evaluator '{self.evaluation_file}' defines 'evaluate_stage1' but no additional "
            f"cascade stages (evaluate_stage2, evaluate_stage3). Consider implementing "
            f"multi-stage evaluation for better cascade benefits."
        )
    else:
        # Fixed: message has no placeholders, so no f-prefix (was a needless f-string).
        logger.debug("Cascade evaluation properly configured with available stage functions")
127+
96128 async def evaluate_program (
97129 self ,
98130 program_code : str ,
@@ -273,15 +305,15 @@ def get_pending_artifacts(self, program_id: str) -> Optional[Dict[str, Union[str
273305 """
274306 return self ._pending_artifacts .pop (program_id , None )
275307
276- async def _direct_evaluate (self , program_path : str ) -> Dict [str , float ]:
308+ async def _direct_evaluate (self , program_path : str ) -> Union [ Dict [str , float ], EvaluationResult ]:
277309 """
278310 Directly evaluate a program using the evaluation function with timeout
279311
280312 Args:
281313 program_path: Path to the program file
282314
283315 Returns:
284- Dictionary of metric name to score
316+ Dictionary of metrics or EvaluationResult with metrics and artifacts
285317
286318 Raises:
287319 asyncio.TimeoutError: If evaluation exceeds timeout
@@ -296,11 +328,8 @@ async def run_evaluation():
296328 # Run the evaluation with timeout - let exceptions bubble up for retry handling
297329 result = await asyncio .wait_for (run_evaluation (), timeout = self .config .timeout )
298330
299- # Validate result
300- if not isinstance (result , dict ):
301- logger .warning (f"Evaluation returned non-dictionary result: { result } " )
302- return {"error" : 0.0 }
303-
331+ # Return result as-is to be processed by _process_evaluation_result
332+ # This supports both dict and EvaluationResult returns, just like _cascade_evaluate
304333 return result
305334
306335 async def _cascade_evaluate (
@@ -354,13 +383,14 @@ async def run_stage1():
354383 )
355384 except Exception as e :
356385 logger .error (f"Error in stage 1 evaluation: { str (e )} " )
357- # Capture stage 1 failure as artifacts
386+ # Capture stage 1 failure with enhanced context
387+ error_context = self ._create_cascade_error_context ("stage1" , e )
358388 return EvaluationResult (
359389 metrics = {"stage1_passed" : 0.0 , "error" : 0.0 },
360390 artifacts = {
361391 "stderr" : str (e ),
362392 "traceback" : traceback .format_exc (),
363- "failure_stage" : "stage1" ,
393+ ** error_context ,
364394 },
365395 )
366396
@@ -481,13 +511,14 @@ async def run_stage3():
481511
482512 except Exception as e :
483513 logger .error (f"Error in cascade evaluation: { str (e )} " )
484- # Return proper cascade failure result instead of re-raising
514+ # Return proper cascade failure result with enhanced context
515+ error_context = self ._create_cascade_error_context ("cascade_setup" , e )
485516 return EvaluationResult (
486517 metrics = {"stage1_passed" : 0.0 , "error" : 0.0 },
487518 artifacts = {
488519 "stderr" : str (e ),
489520 "traceback" : traceback .format_exc (),
490- "failure_stage" : "cascade_setup" ,
521+ ** error_context ,
491522 },
492523 )
493524
@@ -582,6 +613,29 @@ async def _llm_evaluate(self, program_code: str, program_id: str = "") -> Dict[s
582613 traceback .print_exc ()
583614 return {}
584615
def _create_cascade_error_context(self, stage: str, error: Exception) -> dict:
    """
    Build a rich error-context dictionary for a cascade failure.

    Args:
        stage: Name of the cascade stage where the failure occurred.
        error: The exception that was raised.

    Returns:
        Dictionary describing the failure (stage, error type/message,
        wall-clock timestamp) plus a snapshot of the evaluator
        configuration relevant to debugging the cascade.
    """
    import time

    # Core failure description derived from the exception itself.
    context = {
        "failure_stage": stage,
        "error_type": type(error).__name__,
        "error_message": str(error),
        "timestamp": time.time(),
    }

    # Snapshot of the configuration state relevant to cascade debugging.
    # cascade_thresholds may be absent on older configs, hence getattr.
    context["cascade_config"] = self.config.cascade_evaluation
    context["cascade_thresholds"] = getattr(self.config, "cascade_thresholds", [])
    context["timeout_config"] = self.config.timeout
    context["evaluation_file"] = self.evaluation_file
    return context
638+
585639 def _passes_threshold (self , metrics : Dict [str , float ], threshold : float ) -> bool :
586640 """
587641 Check if metrics pass a threshold
0 commit comments