@@ -117,12 +117,21 @@ async def evaluate_program(
         # Retry logic for evaluation
         last_exception = None
         for attempt in range(self.config.max_retries + 1):
-            # Create a temporary file for the program
-            with tempfile.NamedTemporaryFile(suffix=".py", delete=False) as temp_file:
-                temp_file.write(program_code.encode("utf-8"))
-                temp_file_path = temp_file.name
-
+            # Create a temporary file for the program - FIXED: proper file handling
+            temp_file_path = None
             try:
+                # Create temp file and write content with proper flushing
+                temp_fd, temp_file_path = tempfile.mkstemp(suffix=".py", text=True)
+                with os.fdopen(temp_fd, 'w') as temp_file:
+                    temp_file.write(program_code)
+                    temp_file.flush()  # Ensure content is written to disk
+                    os.fsync(temp_file.fileno())  # Force sync to disk
+
+                # Verify file was written correctly (debug)
+                with open(temp_file_path, 'r') as verify_file:
+                    written_content = verify_file.read()
+                    logger.debug(f"Temp file content (first 100 chars): {written_content[:100]}")
+
                 # Run evaluation
                 if self.config.cascade_evaluation:
                     # Run cascade evaluation
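The core of this hunk is swapping `NamedTemporaryFile` for `tempfile.mkstemp` with an explicit `flush`/`fsync`, so the program source is guaranteed to be on disk before anything else reads the path. A minimal standalone sketch of the same pattern (the function name and structure here are illustrative, not taken from the patch):

import os
import tempfile

def write_temp_program(program_code: str) -> str:
    """Write source code to a temp .py file and make sure it reaches disk."""
    fd, path = tempfile.mkstemp(suffix=".py", text=True)
    with os.fdopen(fd, "w") as f:
        f.write(program_code)
        f.flush()             # flush Python's userspace buffer
        os.fsync(f.fileno())  # ask the OS to sync the file to disk
    return path  # caller is responsible for os.unlink(path)

The fsync matters because the file is consumed out-of-band (by an executor thread or subprocess) rather than through the same file object, so buffered-but-unwritten content would otherwise be a race.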
@@ -186,28 +195,49 @@ async def evaluate_program(
                 )
                 traceback.print_exc()
 
-                # Capture failure artifacts if enabled
+                # Capture failure artifacts if enabled - FIXED: better artifact capture
                 if artifacts_enabled and program_id:
-                    self._pending_artifacts[program_id] = {
+                    failure_artifacts = {
                         "stderr": str(e),
                         "traceback": traceback.format_exc(),
                         "failure_stage": "evaluation",
+                        "attempt": attempt + 1,
+                        "timeout_config": self.config.timeout,
                     }
+
+                    # Check if this was a timeout error
+                    if isinstance(e, asyncio.TimeoutError) or "timeout" in str(e).lower():
+                        failure_artifacts["timeout"] = True
+                        failure_artifacts["failure_stage"] = "timeout"
+
+                    # Store or update artifacts
+                    if program_id in self._pending_artifacts:
+                        self._pending_artifacts[program_id].update(failure_artifacts)
+                    else:
+                        self._pending_artifacts[program_id] = failure_artifacts
 
                 # If this is not the last attempt, wait a bit before retrying
                 if attempt < self.config.max_retries:
                     await asyncio.sleep(1.0)  # Wait 1 second before retry
 
             finally:
                 # Clean up temporary file
-                if os.path.exists(temp_file_path):
-                    os.unlink(temp_file_path)
+                if temp_file_path and os.path.exists(temp_file_path):
+                    try:
+                        os.unlink(temp_file_path)
+                    except OSError:
+                        pass  # Ignore cleanup errors
 
-        # All retries failed
+        # All retries failed - FIXED: better error return with timeout info
         logger.error(
             f"All evaluation attempts failed for program{program_id_str}. Last error: {str(last_exception)}"
         )
-        return {"error": 0.0}
+
+        # Check if the last exception was a timeout
+        if isinstance(last_exception, asyncio.TimeoutError):
+            return {"error": 0.0, "timeout": True}
+        else:
+            return {"error": 0.0}
 
     def _process_evaluation_result(self, result: Any) -> EvaluationResult:
         """
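The artifact handling above now merges new failure data into whatever is already pending for the program instead of overwriting it, and flags timeouts explicitly. A reduced sketch of that merge logic with hypothetical module-level names (the patch itself keeps this state on the evaluator instance):

import asyncio
from typing import Any, Dict

pending_artifacts: Dict[str, Dict[str, Any]] = {}

def record_failure(program_id: str, exc: Exception, attempt: int, timeout_s: float) -> None:
    """Merge failure details into any artifacts already captured for this program."""
    artifacts: Dict[str, Any] = {
        "stderr": str(exc),
        "failure_stage": "evaluation",
        "attempt": attempt + 1,
        "timeout_config": timeout_s,
    }
    # Timeouts are flagged explicitly so downstream consumers can tell them apart
    if isinstance(exc, asyncio.TimeoutError) or "timeout" in str(exc).lower():
        artifacts["timeout"] = True
        artifacts["failure_stage"] = "timeout"
    # update() preserves artifacts captured by earlier attempts or stages
    pending_artifacts.setdefault(program_id, {}).update(artifacts)

Note that `setdefault(...).update(...)` is equivalent to the patch's explicit if/else branch; the merge semantics are what keep artifacts from earlier retry attempts from being silently dropped.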
@@ -252,27 +282,35 @@ async def _direct_evaluate(self, program_path: str) -> Dict[str, float]:
         Returns:
             Dictionary of metric name to score
         """
+        logger.debug(f"Starting direct evaluation with timeout={self.config.timeout}s")
+
         try:
             # Create a coroutine that runs the evaluation function in an executor
             async def run_evaluation():
                 loop = asyncio.get_event_loop()
-                return await loop.run_in_executor(None, self.evaluate_function, program_path)
+                logger.debug(f"Running evaluation function on {program_path}")
+                result = await loop.run_in_executor(None, self.evaluate_function, program_path)
+                logger.debug(f"Evaluation function returned: {result}")
+                return result
 
             # Run the evaluation with timeout
+            logger.debug(f"Waiting for evaluation with {self.config.timeout}s timeout")
             result = await asyncio.wait_for(run_evaluation(), timeout=self.config.timeout)
 
             # Validate result
             if not isinstance(result, dict):
                 logger.warning(f"Evaluation returned non-dictionary result: {result}")
                 return {"error": 0.0}
 
+            logger.debug(f"Evaluation completed successfully: {result}")
             return result
 
         except asyncio.TimeoutError:
             logger.warning(f"Evaluation timed out after {self.config.timeout}s")
             return {"error": 0.0, "timeout": True}
         except Exception as e:
             logger.error(f"Error in direct evaluation: {str(e)}")
+            traceback.print_exc()
             return {"error": 0.0}
 
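The timeout machinery in `_direct_evaluate` runs the synchronous evaluation function in an executor so that `asyncio.wait_for` can bound it. A self-contained sketch of that pattern, with a stand-in evaluator in place of the real `evaluate_function`:

import asyncio
import time
from typing import Dict

def slow_evaluate(path: str) -> Dict[str, float]:
    time.sleep(2)  # stand-in for real evaluation work
    return {"score": 1.0}

async def evaluate_with_timeout(path: str, timeout_s: float) -> Dict[str, float]:
    loop = asyncio.get_running_loop()
    try:
        # run_in_executor keeps the event loop free; wait_for enforces the deadline
        return await asyncio.wait_for(
            loop.run_in_executor(None, slow_evaluate, path), timeout=timeout_s
        )
    except asyncio.TimeoutError:
        return {"error": 0.0, "timeout": True}

print(asyncio.run(evaluate_with_timeout("program.py", timeout_s=0.5)))
# -> {'error': 0.0, 'timeout': True}

One caveat of this approach: `wait_for` cancels the awaiting coroutine, but the executor thread itself is not killed and will run to completion in the background, which is presumably why the patch relies on temp-file cleanup in `finally` rather than on the evaluation stopping promptly.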
278316 async def _cascade_evaluate (
@@ -308,7 +346,6 @@ async def _cascade_evaluate(
 
         # Run first stage with timeout
         try:
-
             async def run_stage1():
                 loop = asyncio.get_event_loop()
                 return await loop.run_in_executor(None, module.evaluate_stage1, program_path)
@@ -348,7 +385,6 @@ async def run_stage1():
 
         # Run second stage with timeout
         try:
-
             async def run_stage2():
                 loop = asyncio.get_event_loop()
                 return await loop.run_in_executor(None, module.evaluate_stage2, program_path)
@@ -410,7 +446,6 @@ async def run_stage2():
 
         # Run third stage with timeout
         try:
-
             async def run_stage3():
                 loop = asyncio.get_event_loop()
                 return await loop.run_in_executor(None, module.evaluate_stage3, program_path)
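These last three hunks only remove stray blank lines, but they show the shape of cascade evaluation: each `evaluate_stageN` is loaded from the user's evaluator module and awaited with the same executor-plus-timeout wrapper as the direct path. A compressed sketch of that cascade; the early-exit threshold check is an assumption here, since the gating logic between stages is not shown in this excerpt:

import asyncio
from typing import Callable, Dict, List

async def run_stage(
    fn: Callable[[str], Dict[str, float]], path: str, timeout_s: float
) -> Dict[str, float]:
    loop = asyncio.get_running_loop()
    return await asyncio.wait_for(loop.run_in_executor(None, fn, path), timeout=timeout_s)

async def cascade(
    stages: List[Callable[[str], Dict[str, float]]],
    path: str,
    timeout_s: float,
    threshold: float,
) -> Dict[str, float]:
    merged: Dict[str, float] = {}
    for fn in stages:
        result = await run_stage(fn, path, timeout_s)
        merged.update(result)
        # Assumed gating: stop early if this stage scores below the threshold
        if min(result.values(), default=0.0) < threshold:
            break
    return merged

The point of the cascade is to spend the full timeout budget only on programs that survive the cheap early stages.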