1313import time
1414import traceback
1515import uuid
16+ import warnings
1617from pathlib import Path
17- from typing import Any , Callable , Dict , List , Optional , Tuple , Union
18+ from typing import Any , Dict , List , Optional , Tuple , Union , Protocol , cast
1819import traceback
1920
2021from openevolve .config import EvaluatorConfig
2930logger = logging .getLogger (__name__ )
3031
3132
33+ class EvaluationObject (Protocol ):
34+ def evaluate (self , program_path : str ) -> EvaluationResult :
35+ ...
36+
37+
38+ class CascadeEvaluationObject (Protocol ):
39+ def evaluate_stage1 (self , program_path : str ) -> EvaluationResult :
40+ ...
41+
42+ def evaluate_stage2 (self , program_path : str ) -> EvaluationResult :
43+ ...
44+
45+ def evaluate_stage3 (self , program_path : str ) -> EvaluationResult :
46+ ...
47+
48+
3249class Evaluator :
3350 """
3451 Evaluates programs and assigns scores
@@ -41,21 +58,26 @@ def __init__(
4158 self ,
4259 config : EvaluatorConfig ,
4360 evaluation_file : str ,
61+ evaluation_object : Optional [EvaluationObject ] = None ,
4462 llm_ensemble : Optional [LLMEnsemble ] = None ,
4563 prompt_sampler : Optional [PromptSampler ] = None ,
4664 database : Optional [ProgramDatabase ] = None ,
4765 ):
66+ if evaluation_file and evaluation_object :
67+ warnings .warn ("Both evaluation_file and evaluation_object provided - evaluation_object overrides evaluation_file" )
4868 self .config = config
4969 self .evaluation_file = evaluation_file
70+ self .evaluation_object = evaluation_object
5071 self .llm_ensemble = llm_ensemble
5172 self .prompt_sampler = prompt_sampler
5273 self .database = database
5374
5475 # Create a task pool for parallel evaluation
5576 self .task_pool = TaskPool (max_concurrency = config .parallel_evaluations )
5677
57- # Set up evaluation function if file exists
58- self ._load_evaluation_function ()
78+ if self .evaluation_object is None :
79+ # Set up evaluation module if file exists
80+ self ._load_evaluation_function ()
5981
6082 # Pending artifacts storage for programs
6183 self ._pending_artifacts : Dict [str , Dict [str , Union [str , bytes ]]] = {}
@@ -87,7 +109,7 @@ def _load_evaluation_function(self) -> None:
87109 f"Evaluation file { self .evaluation_file } does not contain an 'evaluate' function"
88110 )
89111
90- self .evaluate_function = module .evaluate
112+ self .evaluation_object = module .evaluate
91113 logger .info (f"Successfully loaded evaluation function from { self .evaluation_file } " )
92114
93115 # Validate cascade configuration
@@ -346,7 +368,7 @@ async def _direct_evaluate(
346368 # Create a coroutine that runs the evaluation function in an executor
347369 async def run_evaluation ():
348370 loop = asyncio .get_event_loop ()
349- return await loop .run_in_executor (None , self .evaluate_function , program_path )
371+ return await loop .run_in_executor (None , self .evaluation_object . evaluate , program_path )
350372
351373 # Run the evaluation with timeout - let exceptions bubble up for retry handling
352374 result = await asyncio .wait_for (run_evaluation (), timeout = self .config .timeout )
@@ -367,31 +389,19 @@ async def _cascade_evaluate(
367389 Returns:
368390 Dictionary of metrics or EvaluationResult with metrics and artifacts
369391 """
370- # Import the evaluation module to get cascade functions if they exist
392+ # This cast just makes static type checkers happy; actual checking is still done using hasattr
393+ evaluation_object = cast (CascadeEvaluationObject , self .evaluation_object )
371394 try :
372- # Add the evaluation file's directory to Python path so it can import local modules
373- eval_dir = os .path .dirname (os .path .abspath (self .evaluation_file ))
374- if eval_dir not in sys .path :
375- sys .path .insert (0 , eval_dir )
376- logger .debug (f"Added { eval_dir } to Python path for cascade evaluation" )
377-
378- spec = importlib .util .spec_from_file_location ("evaluation_module" , self .evaluation_file )
379- if spec is None or spec .loader is None :
380- return await self ._direct_evaluate (program_path )
381-
382- module = importlib .util .module_from_spec (spec )
383- spec .loader .exec_module (module )
384-
385395 # Check if cascade functions exist
386- if not hasattr (module , "evaluate_stage1" ):
396+ if not hasattr (evaluation_object , "evaluate_stage1" ):
387397 return await self ._direct_evaluate (program_path )
388398
389399 # Run first stage with timeout
390400 try :
391401
392402 async def run_stage1 ():
393403 loop = asyncio .get_event_loop ()
394- return await loop .run_in_executor (None , module .evaluate_stage1 , program_path )
404+ return await loop .run_in_executor (None , evaluation_object .evaluate_stage1 , program_path )
395405
396406 stage1_result = await asyncio .wait_for (run_stage1 (), timeout = self .config .timeout )
397407 stage1_eval_result = self ._process_evaluation_result (stage1_result )
@@ -424,15 +434,15 @@ async def run_stage1():
424434 return stage1_eval_result
425435
426436 # Check if second stage exists
427- if not hasattr (module , "evaluate_stage2" ):
437+ if not hasattr (evaluation_object , "evaluate_stage2" ):
428438 return stage1_eval_result
429439
430440 # Run second stage with timeout
431441 try :
432442
433443 async def run_stage2 ():
434444 loop = asyncio .get_event_loop ()
435- return await loop .run_in_executor (None , module .evaluate_stage2 , program_path )
445+ return await loop .run_in_executor (None , evaluation_object .evaluate_stage2 , program_path )
436446
437447 stage2_result = await asyncio .wait_for (run_stage2 (), timeout = self .config .timeout )
438448 stage2_eval_result = self ._process_evaluation_result (stage2_result )
@@ -486,15 +496,15 @@ async def run_stage2():
486496 return merged_result
487497
488498 # Check if third stage exists
489- if not hasattr (module , "evaluate_stage3" ):
499+ if not hasattr (evaluation_object , "evaluate_stage3" ):
490500 return merged_result
491501
492502 # Run third stage with timeout
493503 try :
494504
495505 async def run_stage3 ():
496506 loop = asyncio .get_event_loop ()
497- return await loop .run_in_executor (None , module .evaluate_stage3 , program_path )
507+ return await loop .run_in_executor (None , evaluation_object .evaluate_stage3 , program_path )
498508
499509 stage3_result = await asyncio .wait_for (run_stage3 (), timeout = self .config .timeout )
500510 stage3_eval_result = self ._process_evaluation_result (stage3_result )
0 commit comments