Commit c8dd75a

Revert "Merge pull request #68 from jvm123/feat-llm-artifact-side-channel"
This reverts commit d1d1a57, reversing changes made to 59872a7.
1 parent d1d1a57 · commit c8dd75a

2 files changed (+17 lines, -50 lines)

README.md

Lines changed: 6 additions & 17 deletions
@@ -185,7 +185,7 @@ See the [Configuration Guide](configs/default_config.yaml) for a full list of op
 
 ## Artifacts Channel
 
-OpenEvolve includes an **artifacts side-channel** that allows evaluators to capture build errors, profiling results, etc. to provide better feedback to the LLM in subsequent generations. This feature enhances the evolution process by giving the LLM context about what went wrong and how to fix it.
+OpenEvolve includes a **artifacts side-channel** that allows evaluators to capture build errors, profiling results, etc. to provide better feedback to the LLM in subsequent generations. This feature enhances the evolution process by giving the LLM context about what went wrong and how to fix it.
 
 The artifacts channel operates alongside the traditional fitness metrics.
 
@@ -205,28 +205,17 @@ return EvaluationResult(
 ```
 
 The next generation prompt will include:
-```markdown
+```
 ## Last Execution Output
 ### Stderr
+```
 SyntaxError: invalid syntax (line 15)
-
+```
 ### Traceback
+```
 ...
 ```
-
-## Example: LLM Feedback
-
-An example for an LLM artifact side channel is part of the default evaluation prompt template, which ends with
-```markdown
-Return your evaluation as a JSON object with the following format:
-{{
-"readability": [score],
-"maintainability": [score],
-"efficiency": [score],
-"reasoning": "[brief explanation of scores]"
-}}
 ```
-The non-float values, in this case the "reasoning" key of the json response that the evaluator LLM generates, will be available within the next generation prompt.
 
 ### Configuration
 
@@ -251,7 +240,7 @@ export ENABLE_ARTIFACTS=false
 ### Benefits
 
 - **Faster convergence** - LLMs can see what went wrong and fix it directly
-- **Better error handling** - Compilation and runtime failures become learning opportunities
+- **Better error handling** - Compilation and runtime failures become learning opportunities
 - **Rich debugging context** - Full stack traces and error messages guide improvements
 - **Zero overhead** - When disabled, no performance impact on evaluation
 
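For orientation, the README section touched above documents the evaluator-side pattern: numeric scores go into `metrics`, while error output goes into `artifacts` so it can be surfaced in the next generation prompt. Below is a minimal sketch of that pattern. The import path, the `evaluate` signature, and the `run_and_score` helper are assumptions for illustration and are not shown in this diff; only the `EvaluationResult(metrics=..., artifacts=...)` usage mirrors the evaluator.py code below.

```python
# Sketch only: the import path, evaluate() signature, and run_and_score()
# helper are assumptions; EvaluationResult(metrics=..., artifacts=...)
# mirrors the usage visible in the evaluator.py diff below.
from openevolve.evaluation_result import EvaluationResult  # assumed path


def evaluate(program_path: str) -> EvaluationResult:
    try:
        score = run_and_score(program_path)  # hypothetical helper
        return EvaluationResult(metrics={"score": score})
    except SyntaxError as e:
        # The error text becomes an artifact and is echoed back to the LLM
        # in the next generation prompt (see "## Last Execution Output" above).
        return EvaluationResult(
            metrics={"score": 0.0},
            artifacts={"stderr": f"SyntaxError: {e}"},
        )
```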
openevolve/evaluator.py

Lines changed: 11 additions & 33 deletions
@@ -131,32 +131,16 @@ async def evaluate_program(
 eval_result = self._process_evaluation_result(result)
 
 # Add LLM feedback if configured
-llm_eval_result = None
 if self.config.use_llm_feedback and self.llm_ensemble:
-    llm_result = await self._llm_evaluate(program_code)
-    llm_eval_result = self._process_evaluation_result(llm_result)
+    feedback_metrics = await self._llm_evaluate(program_code)
 
     # Combine metrics
-    for name, value in llm_result.metrics.items():
+    for name, value in feedback_metrics.items():
         eval_result.metrics[f"llm_{name}"] = value * self.config.llm_feedback_weight
 
 # Store artifacts if enabled and present
-if (
-    artifacts_enabled
-    and (
-        eval_result.has_artifacts()
-        or (llm_eval_result and llm_eval_result.has_artifacts())
-    )
-    and program_id
-):
-    self._pending_artifacts[program_id] = {}
-
-    # Merge eval_result artifacts with llm artifacts if they exist
-    if eval_result.has_artifacts():
-        self._pending_artifacts[program_id].update(eval_result.artifacts)
-
-    if llm_eval_result and llm_eval_result.has_artifacts():
-        self._pending_artifacts[program_id].update(llm_eval_result.artifacts)
+if artifacts_enabled and eval_result.has_artifacts() and program_id:
+    self._pending_artifacts[program_id] = eval_result.artifacts
 
 elapsed = time.time() - start_time
 logger.info(
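The branch removed in this hunk merged artifacts from both the program evaluation and the LLM feedback into `self._pending_artifacts[program_id]`; the restored code stores only `eval_result.artifacts`. A toy illustration of that difference in merge semantics, using plain dicts in place of `EvaluationResult.artifacts` (the values are made up):

```python
# Toy example: plain dicts stand in for EvaluationResult.artifacts;
# the string values are illustrative.
eval_artifacts = {"stderr": "SyntaxError: invalid syntax (line 15)"}
llm_artifacts = {"reasoning": "Penalized for unclear variable names."}

# Reverted (removed) behavior: merge both sources into one pending-artifacts dict.
merged = {}
merged.update(eval_artifacts)
merged.update(llm_artifacts)
assert set(merged) == {"stderr", "reasoning"}

# Restored behavior: only the program evaluation's artifacts are kept.
pending = eval_artifacts
assert set(pending) == {"stderr"}
```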
@@ -172,7 +156,6 @@ async def evaluate_program(
 logger.warning(
     f"Evaluation attempt {attempt + 1}/{self.config.max_retries + 1} failed for program{program_id_str}: {str(e)}"
 )
-traceback.print_exc()
 
 # Capture failure artifacts if enabled
 if artifacts_enabled and program_id:
@@ -425,7 +408,6 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
 json_pattern = r"```json\n(.*?)\n```"
 import re
 
-artifacts = {}
 avg_metrics = {}
 for i, response in enumerate(responses):
     json_match = re.search(json_pattern, response, re.DOTALL)
@@ -444,13 +426,12 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
 # Parse JSON
 result = json.loads(json_str)
 
-# All non-numeric values are artifacts, all numeric values are metrics
-metrics = {}
-for key, value in result.items():
-    if not isinstance(value, (int, float)):
-        artifacts[key] = value
-    else:
-        metrics[key] = float(value)
+# Filter all non-numeric values
+metrics = {
+    name: float(value)
+    for name, value in result.items()
+    if isinstance(value, (int, float))
+}
 
 # Weight of the model in the ensemble
 weight = self.llm_ensemble.weights[i] if self.llm_ensemble.weights else 1.0
@@ -462,10 +443,7 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
 else:
     avg_metrics[name] = value * weight
 
-return EvaluationResult(
-    metrics=avg_metrics,
-    artifacts=artifacts,
-)
+return avg_metrics
 
 except Exception as e:
     logger.warning(f"Error parsing LLM response: {str(e)}")
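With the revert, `_llm_evaluate` again keeps only numeric fields from the parsed LLM JSON and returns a plain metrics dict; non-numeric fields such as a `reasoning` string are dropped rather than forwarded as artifacts. A self-contained sketch of that filtering step follows; the sample response is illustrative, shaped like the prompt template in the removed README section, not taken from a real run.

```python
import json

# Illustrative LLM response body; field names mirror the removed README example.
raw = '{"readability": 0.8, "maintainability": 0.7, "efficiency": 0.9, "reasoning": "Clear naming, minor duplication."}'
result = json.loads(raw)

# Same filter as the restored code: keep only int/float values as metrics.
metrics = {
    name: float(value)
    for name, value in result.items()
    if isinstance(value, (int, float))
}

print(metrics)  # {'readability': 0.8, 'maintainability': 0.7, 'efficiency': 0.9}
# "reasoning" is discarded here; the reverted branch had returned it as an artifact.
```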
