Commit d1d1a57

Merge pull request #68 from jvm123/feat-llm-artifact-side-channel
Feature: Artifact side channel responses for LLM feedback
2 parents: 59872a7 + 0fe928f · commit d1d1a57

2 files changed: +50 −17 lines changed

README.md

Lines changed: 17 additions & 6 deletions
````diff
@@ -185,7 +185,7 @@ See the [Configuration Guide](configs/default_config.yaml) for a full list of op
 
 ## Artifacts Channel
 
-OpenEvolve includes a **artifacts side-channel** that allows evaluators to capture build errors, profiling results, etc. to provide better feedback to the LLM in subsequent generations. This feature enhances the evolution process by giving the LLM context about what went wrong and how to fix it.
+OpenEvolve includes an **artifacts side-channel** that allows evaluators to capture build errors, profiling results, etc. to provide better feedback to the LLM in subsequent generations. This feature enhances the evolution process by giving the LLM context about what went wrong and how to fix it.
 
 The artifacts channel operates alongside the traditional fitness metrics.
 
@@ -205,17 +205,28 @@ return EvaluationResult(
 ```
 
 The next generation prompt will include:
-```
+```markdown
 ## Last Execution Output
 ### Stderr
-```
 SyntaxError: invalid syntax (line 15)
-```
+
 ### Traceback
-```
 ...
 ```
+
+## Example: LLM Feedback
+
+An example for an LLM artifact side channel is part of the default evaluation prompt template, which ends with
+```markdown
+Return your evaluation as a JSON object with the following format:
+{{
+"readability": [score],
+"maintainability": [score],
+"efficiency": [score],
+"reasoning": "[brief explanation of scores]"
+}}
 ```
+The non-float values, in this case the "reasoning" key of the json response that the evaluator LLM generates, will be available within the next generation prompt.
 
 ### Configuration
 
@@ -240,7 +251,7 @@ export ENABLE_ARTIFACTS=false
 ### Benefits
 
 - **Faster convergence** - LLMs can see what went wrong and fix it directly
-- **Better error handling** - Compilation and runtime failures become learning opportunities
+- **Better error handling** - Compilation and runtime failures become learning opportunities
 - **Rich debugging context** - Full stack traces and error messages guide improvements
 - **Zero overhead** - When disabled, no performance impact on evaluation
 
````
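
For orientation, here is a minimal evaluator sketch that exercises the artifacts side-channel described in the README change above. The `evaluate(program_path)` signature and the import path for `EvaluationResult` are assumptions for illustration, not taken from this diff; only the `metrics`/`artifacts` fields mirror the commit.

```python
# Hypothetical evaluator sketch (import path and function signature assumed):
# numeric metrics drive selection, while free-form artifacts such as stderr
# are fed back to the LLM in the next generation prompt.
import subprocess

from openevolve.evaluation_result import EvaluationResult  # assumed module path


def evaluate(program_path: str) -> EvaluationResult:
    # Run the candidate program and capture its output
    proc = subprocess.run(
        ["python", program_path], capture_output=True, text=True, timeout=30
    )
    if proc.returncode != 0:
        # Failed run: score it zero, but pass the error text along as artifacts
        return EvaluationResult(
            metrics={"score": 0.0},
            artifacts={"stderr": proc.stderr},
        )
    return EvaluationResult(metrics={"score": 1.0}, artifacts={})
```
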
openevolve/evaluator.py

Lines changed: 33 additions & 11 deletions
````diff
@@ -131,16 +131,32 @@ async def evaluate_program(
 eval_result = self._process_evaluation_result(result)
 
 # Add LLM feedback if configured
+llm_eval_result = None
 if self.config.use_llm_feedback and self.llm_ensemble:
-    feedback_metrics = await self._llm_evaluate(program_code)
+    llm_result = await self._llm_evaluate(program_code)
+    llm_eval_result = self._process_evaluation_result(llm_result)
 
     # Combine metrics
-    for name, value in feedback_metrics.items():
+    for name, value in llm_result.metrics.items():
         eval_result.metrics[f"llm_{name}"] = value * self.config.llm_feedback_weight
 
 # Store artifacts if enabled and present
-if artifacts_enabled and eval_result.has_artifacts() and program_id:
-    self._pending_artifacts[program_id] = eval_result.artifacts
+if (
+    artifacts_enabled
+    and (
+        eval_result.has_artifacts()
+        or (llm_eval_result and llm_eval_result.has_artifacts())
+    )
+    and program_id
+):
+    self._pending_artifacts[program_id] = {}
+
+    # Merge eval_result artifacts with llm artifacts if they exist
+    if eval_result.has_artifacts():
+        self._pending_artifacts[program_id].update(eval_result.artifacts)
+
+    if llm_eval_result and llm_eval_result.has_artifacts():
+        self._pending_artifacts[program_id].update(llm_eval_result.artifacts)
 
 elapsed = time.time() - start_time
 logger.info(
@@ -156,6 +172,7 @@ async def evaluate_program(
 logger.warning(
     f"Evaluation attempt {attempt + 1}/{self.config.max_retries + 1} failed for program{program_id_str}: {str(e)}"
 )
+traceback.print_exc()
 
 # Capture failure artifacts if enabled
 if artifacts_enabled and program_id:
@@ -408,6 +425,7 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
 json_pattern = r"```json\n(.*?)\n```"
 import re
 
+artifacts = {}
 avg_metrics = {}
 for i, response in enumerate(responses):
     json_match = re.search(json_pattern, response, re.DOTALL)
@@ -426,12 +444,13 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
 # Parse JSON
 result = json.loads(json_str)
 
-# Filter all non-numeric values
-metrics = {
-    name: float(value)
-    for name, value in result.items()
-    if isinstance(value, (int, float))
-}
+# All non-numeric values are artifacts, all numeric values are metrics
+metrics = {}
+for key, value in result.items():
+    if not isinstance(value, (int, float)):
+        artifacts[key] = value
+    else:
+        metrics[key] = float(value)
 
 # Weight of the model in the ensemble
 weight = self.llm_ensemble.weights[i] if self.llm_ensemble.weights else 1.0
@@ -443,7 +462,10 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
 else:
     avg_metrics[name] = value * weight
 
-return avg_metrics
+return EvaluationResult(
+    metrics=avg_metrics,
+    artifacts=artifacts,
+)
 
 except Exception as e:
     logger.warning(f"Error parsing LLM response: {str(e)}")
````
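
As a quick standalone illustration of the parsing change in `_llm_evaluate` above, the sketch below applies the same numeric/non-numeric split to a parsed LLM response. The sample JSON is made up for the example; only the split logic mirrors the commit.

```python
# Standalone sketch of the metric/artifact split introduced above: numeric
# values become metrics, everything else (e.g. "reasoning") becomes an artifact.
import json

sample_response = """
{
    "readability": 0.8,
    "maintainability": 0.7,
    "efficiency": 0.9,
    "reasoning": "Clear structure, but the inner loop reallocates on every pass."
}
"""

result = json.loads(sample_response)

metrics = {}
artifacts = {}
for key, value in result.items():
    if isinstance(value, (int, float)):
        metrics[key] = float(value)
    else:
        artifacts[key] = value

print(metrics)    # {'readability': 0.8, 'maintainability': 0.7, 'efficiency': 0.9}
print(artifacts)  # {'reasoning': 'Clear structure, but the inner loop reallocates on every pass.'}
```
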
