@@ -117,21 +117,12 @@ async def evaluate_program(
117117 # Retry logic for evaluation
118118 last_exception = None
119119 for attempt in range (self .config .max_retries + 1 ):
120- # Create a temporary file for the program - FIXED: proper file handling
121- temp_file_path = None
122- try :
123- # Create temp file and write content with proper flushing
124- temp_fd , temp_file_path = tempfile .mkstemp (suffix = ".py" , text = True )
125- with os .fdopen (temp_fd , 'w' ) as temp_file :
126- temp_file .write (program_code )
127- temp_file .flush () # Ensure content is written to disk
128- os .fsync (temp_file .fileno ()) # Force sync to disk
129-
130- # Verify file was written correctly (debug)
131- with open (temp_file_path , 'r' ) as verify_file :
132- written_content = verify_file .read ()
133- logger .debug (f"Temp file content (first 100 chars): { written_content [:100 ]} " )
120+ # Create a temporary file for the program
121+ with tempfile .NamedTemporaryFile (suffix = ".py" , delete = False ) as temp_file :
122+ temp_file .write (program_code .encode ("utf-8" ))
123+ temp_file_path = temp_file .name
134124
125+ try :
135126 # Run evaluation
136127 if self .config .cascade_evaluation :
137128 # Run cascade evaluation
@@ -195,49 +186,28 @@ async def evaluate_program(
195186 )
196187 traceback .print_exc ()
197188
198- # Capture failure artifacts if enabled - FIXED: better artifact capture
189+ # Capture failure artifacts if enabled
199190 if artifacts_enabled and program_id :
200- failure_artifacts = {
191+ self . _pending_artifacts [ program_id ] = {
201192 "stderr" : str (e ),
202193 "traceback" : traceback .format_exc (),
203194 "failure_stage" : "evaluation" ,
204- "attempt" : attempt + 1 ,
205- "timeout_config" : self .config .timeout ,
206195 }
207-
208- # Check if this was a timeout error
209- if isinstance (e , asyncio .TimeoutError ) or "timeout" in str (e ).lower ():
210- failure_artifacts ["timeout" ] = True
211- failure_artifacts ["failure_stage" ] = "timeout"
212-
213- # Store or update artifacts
214- if program_id in self ._pending_artifacts :
215- self ._pending_artifacts [program_id ].update (failure_artifacts )
216- else :
217- self ._pending_artifacts [program_id ] = failure_artifacts
218196
219197 # If this is not the last attempt, wait a bit before retrying
220198 if attempt < self .config .max_retries :
221199 await asyncio .sleep (1.0 ) # Wait 1 second before retry
222200
223201 finally :
224202 # Clean up temporary file
225- if temp_file_path and os .path .exists (temp_file_path ):
226- try :
227- os .unlink (temp_file_path )
228- except OSError :
229- pass # Ignore cleanup errors
203+ if os .path .exists (temp_file_path ):
204+ os .unlink (temp_file_path )
230205
231- # All retries failed - FIXED: better error return with timeout info
206+ # All retries failed
232207 logger .error (
233208 f"All evaluation attempts failed for program{ program_id_str } . Last error: { str (last_exception )} "
234209 )
235-
236- # Check if the last exception was a timeout
237- if isinstance (last_exception , asyncio .TimeoutError ):
238- return {"error" : 0.0 , "timeout" : True }
239- else :
240- return {"error" : 0.0 }
210+ return {"error" : 0.0 }
241211
242212 def _process_evaluation_result (self , result : Any ) -> EvaluationResult :
243213 """
@@ -282,35 +252,27 @@ async def _direct_evaluate(self, program_path: str) -> Dict[str, float]:
282252 Returns:
283253 Dictionary of metric name to score
284254 """
285- logger .debug (f"Starting direct evaluation with timeout={ self .config .timeout } s" )
286-
287255 try :
288256 # Create a coroutine that runs the evaluation function in an executor
289257 async def run_evaluation ():
290258 loop = asyncio .get_event_loop ()
291- logger .debug (f"Running evaluation function on { program_path } " )
292- result = await loop .run_in_executor (None , self .evaluate_function , program_path )
293- logger .debug (f"Evaluation function returned: { result } " )
294- return result
259+ return await loop .run_in_executor (None , self .evaluate_function , program_path )
295260
296261 # Run the evaluation with timeout
297- logger .debug (f"Waiting for evaluation with { self .config .timeout } s timeout" )
298262 result = await asyncio .wait_for (run_evaluation (), timeout = self .config .timeout )
299263
300264 # Validate result
301265 if not isinstance (result , dict ):
302266 logger .warning (f"Evaluation returned non-dictionary result: { result } " )
303267 return {"error" : 0.0 }
304268
305- logger .debug (f"Evaluation completed successfully: { result } " )
306269 return result
307270
308271 except asyncio .TimeoutError :
309272 logger .warning (f"Evaluation timed out after { self .config .timeout } s" )
310273 return {"error" : 0.0 , "timeout" : True }
311274 except Exception as e :
312275 logger .error (f"Error in direct evaluation: { str (e )} " )
313- traceback .print_exc ()
314276 return {"error" : 0.0 }
315277
316278 async def _cascade_evaluate (
@@ -346,6 +308,7 @@ async def _cascade_evaluate(
346308
347309 # Run first stage with timeout
348310 try :
311+
349312 async def run_stage1 ():
350313 loop = asyncio .get_event_loop ()
351314 return await loop .run_in_executor (None , module .evaluate_stage1 , program_path )
@@ -385,6 +348,7 @@ async def run_stage1():
385348
386349 # Run second stage with timeout
387350 try :
351+
388352 async def run_stage2 ():
389353 loop = asyncio .get_event_loop ()
390354 return await loop .run_in_executor (None , module .evaluate_stage2 , program_path )
@@ -446,6 +410,7 @@ async def run_stage2():
446410
447411 # Run third stage with timeout
448412 try :
413+
449414 async def run_stage3 ():
450415 loop = asyncio .get_event_loop ()
451416 return await loop .run_in_executor (None , module .evaluate_stage3 , program_path )
0 commit comments