@@ -134,6 +134,20 @@ async def evaluate_program(
                 # Process the result based on type
                 eval_result = self._process_evaluation_result(result)

+                # Check if this was a timeout and capture artifacts if enabled
+                if artifacts_enabled and program_id and eval_result.metrics.get("timeout") is True:
+                    if program_id not in self._pending_artifacts:
+                        self._pending_artifacts[program_id] = {}
+
+                    self._pending_artifacts[program_id].update(
+                        {
+                            "timeout": True,
+                            "timeout_duration": self.config.timeout,
+                            "failure_stage": "evaluation",
+                            "error_type": "timeout",
+                        }
+                    )
+
                 # Add LLM feedback if configured
                 llm_eval_result = None
                 if self.config.use_llm_feedback and self.llm_ensemble:
@@ -153,7 +167,8 @@ async def evaluate_program(
                     )
                     and program_id
                 ):
-                    self._pending_artifacts[program_id] = {}
+                    if program_id not in self._pending_artifacts:
+                        self._pending_artifacts[program_id] = {}

                     # Merge eval_result artifacts with llm artifacts if they exist
                     if eval_result.has_artifacts():
@@ -179,6 +194,21 @@ async def evaluate_program(
                 # Return just metrics for backward compatibility
                 return eval_result.metrics

+            except asyncio.TimeoutError:
+                # Handle timeout specially - don't retry, just return timeout result
+                logger.warning(f"Evaluation timed out after {self.config.timeout}s")
+
+                # Capture timeout artifacts if enabled
+                if artifacts_enabled and program_id:
+                    self._pending_artifacts[program_id] = {
+                        "timeout": True,
+                        "timeout_duration": self.config.timeout,
+                        "failure_stage": "evaluation",
+                        "error_type": "timeout",
+                    }
+
+                return {"error": 0.0, "timeout": True}
+
             except Exception as e:
                 last_exception = e
                 logger.warning(
@@ -192,6 +222,7 @@ async def evaluate_program(
                         "stderr": str(e),
                         "traceback": traceback.format_exc(),
                         "failure_stage": "evaluation",
+                        "attempt": attempt + 1,
                     }

                 # If this is not the last attempt, wait a bit before retrying
@@ -251,30 +282,27 @@ async def _direct_evaluate(self, program_path: str) -> Dict[str, float]:

         Returns:
             Dictionary of metric name to score
-        """
-        try:
-            # Create a coroutine that runs the evaluation function in an executor
-            async def run_evaluation():
-                loop = asyncio.get_event_loop()
-                return await loop.run_in_executor(None, self.evaluate_function, program_path)

-            # Run the evaluation with timeout
-            result = await asyncio.wait_for(run_evaluation(), timeout=self.config.timeout)
+        Raises:
+            asyncio.TimeoutError: If evaluation exceeds timeout
+            Exception: If evaluation function raises an exception
+        """

-            # Validate result
-            if not isinstance(result, dict):
-                logger.warning(f"Evaluation returned non-dictionary result: {result}")
-                return {"error": 0.0}
+        # Create a coroutine that runs the evaluation function in an executor
+        async def run_evaluation():
+            loop = asyncio.get_event_loop()
+            return await loop.run_in_executor(None, self.evaluate_function, program_path)

-            return result
+        # Run the evaluation with timeout - let exceptions bubble up for retry handling
+        result = await asyncio.wait_for(run_evaluation(), timeout=self.config.timeout)

-        except asyncio.TimeoutError:
-            logger.warning(f"Evaluation timed out after {self.config.timeout}s")
-            return {"error": 0.0, "timeout": True}
-        except Exception as e:
-            logger.error(f"Error in direct evaluation: {str(e)}")
+        # Validate result
+        if not isinstance(result, dict):
+            logger.warning(f"Evaluation returned non-dictionary result: {result}")
             return {"error": 0.0}

+        return result
+
     async def _cascade_evaluate(
         self, program_path: str
     ) -> Union[Dict[str, float], EvaluationResult]:
@@ -286,6 +314,10 @@ async def _cascade_evaluate(

         Returns:
             Dictionary of metrics or EvaluationResult with metrics and artifacts
+
+        Raises:
+            asyncio.TimeoutError: If any stage exceeds timeout
+            Exception: If any evaluation stage raises an exception
         """
         # Import the evaluation module to get cascade functions if they exist
         try:
@@ -307,34 +339,12 @@ async def _cascade_evaluate(
                 return await self._direct_evaluate(program_path)

             # Run first stage with timeout
-            try:
+            async def run_stage1():
+                loop = asyncio.get_event_loop()
+                return await loop.run_in_executor(None, module.evaluate_stage1, program_path)

-                async def run_stage1():
-                    loop = asyncio.get_event_loop()
-                    return await loop.run_in_executor(None, module.evaluate_stage1, program_path)
-
-                stage1_result = await asyncio.wait_for(run_stage1(), timeout=self.config.timeout)
-                stage1_eval_result = self._process_evaluation_result(stage1_result)
-            except asyncio.TimeoutError:
-                logger.warning(f"Stage 1 evaluation timed out after {self.config.timeout}s")
-                return EvaluationResult(
-                    metrics={"stage1_passed": 0.0, "error": 0.0, "timeout": True},
-                    artifacts={
-                        "failure_stage": "stage1",
-                        "timeout": True,
-                    },
-                )
-            except Exception as e:
-                logger.error(f"Error in stage 1 evaluation: {str(e)}")
-                # Capture stage 1 failure as artifacts
-                return EvaluationResult(
-                    metrics={"stage1_passed": 0.0, "error": 0.0},
-                    artifacts={
-                        "stderr": str(e),
-                        "traceback": traceback.format_exc(),
-                        "failure_stage": "stage1",
-                    },
-                )
+            stage1_result = await asyncio.wait_for(run_stage1(), timeout=self.config.timeout)
+            stage1_eval_result = self._process_evaluation_result(stage1_result)

             # Check threshold
             if not self._passes_threshold(
@@ -347,38 +357,12 @@ async def run_stage1():
                 return stage1_eval_result

             # Run second stage with timeout
-            try:
-
-                async def run_stage2():
-                    loop = asyncio.get_event_loop()
-                    return await loop.run_in_executor(None, module.evaluate_stage2, program_path)
+            async def run_stage2():
+                loop = asyncio.get_event_loop()
+                return await loop.run_in_executor(None, module.evaluate_stage2, program_path)

-                stage2_result = await asyncio.wait_for(run_stage2(), timeout=self.config.timeout)
-                stage2_eval_result = self._process_evaluation_result(stage2_result)
-            except asyncio.TimeoutError:
-                logger.warning(f"Stage 2 evaluation timed out after {self.config.timeout}s")
-                # Capture stage 2 failure, but keep stage 1 results
-                stage1_eval_result.artifacts.update(
-                    {
-                        "stage2_timeout": True,
-                        "failure_stage": "stage2",
-                    }
-                )
-                stage1_eval_result.metrics["stage2_passed"] = 0.0
-                stage1_eval_result.metrics["timeout"] = True
-                return stage1_eval_result
-            except Exception as e:
-                logger.error(f"Error in stage 2 evaluation: {str(e)}")
-                # Capture stage 2 failure, but keep stage 1 results
-                stage1_eval_result.artifacts.update(
-                    {
-                        "stage2_stderr": str(e),
-                        "stage2_traceback": traceback.format_exc(),
-                        "failure_stage": "stage2",
-                    }
-                )
-                stage1_eval_result.metrics["stage2_passed"] = 0.0
-                return stage1_eval_result
+            stage2_result = await asyncio.wait_for(run_stage2(), timeout=self.config.timeout)
+            stage2_eval_result = self._process_evaluation_result(stage2_result)

             # Merge results from stage 1 and 2
             merged_metrics = {}
@@ -409,38 +393,12 @@ async def run_stage2():
                 return merged_result

             # Run third stage with timeout
-            try:
+            async def run_stage3():
+                loop = asyncio.get_event_loop()
+                return await loop.run_in_executor(None, module.evaluate_stage3, program_path)

-                async def run_stage3():
-                    loop = asyncio.get_event_loop()
-                    return await loop.run_in_executor(None, module.evaluate_stage3, program_path)
-
-                stage3_result = await asyncio.wait_for(run_stage3(), timeout=self.config.timeout)
-                stage3_eval_result = self._process_evaluation_result(stage3_result)
-            except asyncio.TimeoutError:
-                logger.warning(f"Stage 3 evaluation timed out after {self.config.timeout}s")
-                # Capture stage 3 failure, but keep previous results
-                merged_result.artifacts.update(
-                    {
-                        "stage3_timeout": True,
-                        "failure_stage": "stage3",
-                    }
-                )
-                merged_result.metrics["stage3_passed"] = 0.0
-                merged_result.metrics["timeout"] = True
-                return merged_result
-            except Exception as e:
-                logger.error(f"Error in stage 3 evaluation: {str(e)}")
-                # Capture stage 3 failure, but keep previous results
-                merged_result.artifacts.update(
-                    {
-                        "stage3_stderr": str(e),
-                        "stage3_traceback": traceback.format_exc(),
-                        "failure_stage": "stage3",
-                    }
-                )
-                merged_result.metrics["stage3_passed"] = 0.0
-                return merged_result
+            stage3_result = await asyncio.wait_for(run_stage3(), timeout=self.config.timeout)
+            stage3_eval_result = self._process_evaluation_result(stage3_result)

             # Merge stage 3 results
             for name, value in stage3_eval_result.metrics.items():
@@ -453,14 +411,8 @@ async def run_stage3():

         except Exception as e:
             logger.error(f"Error in cascade evaluation: {str(e)}")
-            return EvaluationResult(
-                metrics={"error": 0.0},
-                artifacts={
-                    "stderr": str(e),
-                    "traceback": traceback.format_exc(),
-                    "failure_stage": "cascade_setup",
-                },
-            )
+            # Re-raise the exception to allow retry handling at higher level
+            raise

     async def _llm_evaluate(self, program_code: str, program_id: str = "") -> Dict[str, float]:
         """
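
Taken together, these hunks move all timeout and error handling out of `_direct_evaluate` and `_cascade_evaluate` and into the retry loop of `evaluate_program`: a timeout short-circuits with a `{"error": 0.0, "timeout": True}` result, while any other exception is retried. A minimal standalone sketch of that control flow follows; it is not the project's code, and `evaluate_once`, `MAX_RETRIES`, and `TIMEOUT` are hypothetical stand-ins for the evaluator's wrapped call and its `self.config` values.

import asyncio
import logging

logger = logging.getLogger(__name__)

MAX_RETRIES = 2   # hypothetical stand-in for self.config.max_retries
TIMEOUT = 30.0    # hypothetical stand-in for self.config.timeout (seconds)

async def evaluate_with_retries(evaluate_once):
    """Run evaluate_once(), retrying failures but returning timeouts immediately."""
    last_exception = None
    for attempt in range(MAX_RETRIES + 1):
        try:
            return await asyncio.wait_for(evaluate_once(), timeout=TIMEOUT)
        except asyncio.TimeoutError:
            # A timeout is treated as a final result, not a transient fault: no retry
            logger.warning(f"Evaluation timed out after {TIMEOUT}s")
            return {"error": 0.0, "timeout": True}
        except Exception as e:
            # Any other failure is retried after a short pause
            last_exception = e
            logger.warning(f"Attempt {attempt + 1}/{MAX_RETRIES + 1} failed: {e}")
            if attempt < MAX_RETRIES:
                await asyncio.sleep(1.0)
    # All retries exhausted: surface a zero-score result
    logger.error(f"All evaluation attempts failed: {last_exception}")
    return {"error": 0.0}

One consequence worth noting for reviewers: `asyncio.wait_for` around `loop.run_in_executor(...)` cancels only the awaiting coroutine, not the executor thread, so a timed-out evaluation function may keep running in the background until it finishes on its own; both the diff and the sketch above inherit that behavior.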