Commit c8dd75a

Revert "Merge pull request #68 from jvm123/feat-llm-artifact-side-channel"
This reverts commit d1d1a57, reversing changes made to 59872a7.
1 parent d1d1a57 · commit c8dd75a

2 files changed (+17 lines, -50 lines)

README.md

Lines changed: 6 additions & 17 deletions
@@ -185,7 +185,7 @@ See the [Configuration Guide](configs/default_config.yaml) for a full list of op
 
 ## Artifacts Channel
 
-OpenEvolve includes an **artifacts side-channel** that allows evaluators to capture build errors, profiling results, etc. to provide better feedback to the LLM in subsequent generations. This feature enhances the evolution process by giving the LLM context about what went wrong and how to fix it.
+OpenEvolve includes a **artifacts side-channel** that allows evaluators to capture build errors, profiling results, etc. to provide better feedback to the LLM in subsequent generations. This feature enhances the evolution process by giving the LLM context about what went wrong and how to fix it.
 
 The artifacts channel operates alongside the traditional fitness metrics.
 
@@ -205,28 +205,17 @@ return EvaluationResult(
 ```
 
 The next generation prompt will include:
-```markdown
+```
 ## Last Execution Output
 ### Stderr
+```
 SyntaxError: invalid syntax (line 15)
-
+```
 ### Traceback
+```
 ...
 ```
-
-## Example: LLM Feedback
-
-An example for an LLM artifact side channel is part of the default evaluation prompt template, which ends with
-```markdown
-Return your evaluation as a JSON object with the following format:
-{{
-"readability": [score],
-"maintainability": [score],
-"efficiency": [score],
-"reasoning": "[brief explanation of scores]"
-}}
 ```
-The non-float values, in this case the "reasoning" key of the json response that the evaluator LLM generates, will be available within the next generation prompt.
 
 ### Configuration
 
@@ -251,7 +240,7 @@ export ENABLE_ARTIFACTS=false
 ### Benefits
 
 - **Faster convergence** - LLMs can see what went wrong and fix it directly
-- **Better error handling** - Compilation and runtime failures become learning opportunities
+- **Better error handling** - Compilation and runtime failures become learning opportunities
 - **Rich debugging context** - Full stack traces and error messages guide improvements
 - **Zero overhead** - When disabled, no performance impact on evaluation
 
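For orientation, the README section touched above documents the evaluator-side pattern: numeric scores go into `metrics`, while error output goes into `artifacts` so it can be surfaced in the next generation prompt. Below is a minimal sketch of that pattern. The import path, the `evaluate` signature, and the `run_and_score` helper are assumptions for illustration and are not shown in this diff; only the `EvaluationResult(metrics=..., artifacts=...)` usage mirrors the evaluator.py code below.

```python
# Sketch only: the import path, evaluate() signature, and run_and_score()
# helper are assumptions; EvaluationResult(metrics=..., artifacts=...)
# mirrors the usage visible in the evaluator.py diff below.
from openevolve.evaluation_result import EvaluationResult  # assumed path


def evaluate(program_path: str) -> EvaluationResult:
    try:
        score = run_and_score(program_path)  # hypothetical helper
        return EvaluationResult(metrics={"score": score})
    except SyntaxError as e:
        # The error text becomes an artifact and is echoed back to the LLM
        # in the next generation prompt (see "## Last Execution Output" above).
        return EvaluationResult(
            metrics={"score": 0.0},
            artifacts={"stderr": f"SyntaxError: {e}"},
        )
```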
openevolve/evaluator.py

Lines changed: 11 additions & 33 deletions
@@ -131,32 +131,16 @@ async def evaluate_program(
 eval_result = self._process_evaluation_result(result)
 
 # Add LLM feedback if configured
-llm_eval_result = None
 if self.config.use_llm_feedback and self.llm_ensemble:
-    llm_result = await self._llm_evaluate(program_code)
-    llm_eval_result = self._process_evaluation_result(llm_result)
+    feedback_metrics = await self._llm_evaluate(program_code)
 
     # Combine metrics
-    for name, value in llm_result.metrics.items():
+    for name, value in feedback_metrics.items():
         eval_result.metrics[f"llm_{name}"] = value * self.config.llm_feedback_weight
 
 # Store artifacts if enabled and present
-if (
-    artifacts_enabled
-    and (
-        eval_result.has_artifacts()
-        or (llm_eval_result and llm_eval_result.has_artifacts())
-    )
-    and program_id
-):
-    self._pending_artifacts[program_id] = {}
-
-    # Merge eval_result artifacts with llm artifacts if they exist
-    if eval_result.has_artifacts():
-        self._pending_artifacts[program_id].update(eval_result.artifacts)
-
-    if llm_eval_result and llm_eval_result.has_artifacts():
-        self._pending_artifacts[program_id].update(llm_eval_result.artifacts)
+if artifacts_enabled and eval_result.has_artifacts() and program_id:
+    self._pending_artifacts[program_id] = eval_result.artifacts
 
 elapsed = time.time() - start_time
 logger.info(
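The branch removed in this hunk merged artifacts from both the program evaluation and the LLM feedback into `self._pending_artifacts[program_id]`; the restored code stores only `eval_result.artifacts`. A toy illustration of that difference in merge semantics, using plain dicts in place of `EvaluationResult.artifacts` (the values are made up):

```python
# Toy example: plain dicts stand in for EvaluationResult.artifacts;
# the string values are illustrative.
eval_artifacts = {"stderr": "SyntaxError: invalid syntax (line 15)"}
llm_artifacts = {"reasoning": "Penalized for unclear variable names."}

# Reverted (removed) behavior: merge both sources into one pending-artifacts dict.
merged = {}
merged.update(eval_artifacts)
merged.update(llm_artifacts)
assert set(merged) == {"stderr", "reasoning"}

# Restored behavior: only the program evaluation's artifacts are kept.
pending = eval_artifacts
assert set(pending) == {"stderr"}
```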
@@ -172,7 +156,6 @@ async def evaluate_program(
 logger.warning(
     f"Evaluation attempt {attempt + 1}/{self.config.max_retries + 1} failed for program{program_id_str}: {str(e)}"
 )
-traceback.print_exc()
 
 # Capture failure artifacts if enabled
 if artifacts_enabled and program_id:
@@ -425,7 +408,6 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
 json_pattern = r"```json\n(.*?)\n```"
 import re
 
-artifacts = {}
 avg_metrics = {}
 for i, response in enumerate(responses):
     json_match = re.search(json_pattern, response, re.DOTALL)
@@ -444,13 +426,12 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
 # Parse JSON
 result = json.loads(json_str)
 
-# All non-numeric values are artifacts, all numeric values are metrics
-metrics = {}
-for key, value in result.items():
-    if not isinstance(value, (int, float)):
-        artifacts[key] = value
-    else:
-        metrics[key] = float(value)
+# Filter all non-numeric values
+metrics = {
+    name: float(value)
+    for name, value in result.items()
+    if isinstance(value, (int, float))
+}
 
 # Weight of the model in the ensemble
 weight = self.llm_ensemble.weights[i] if self.llm_ensemble.weights else 1.0
@@ -462,10 +443,7 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
 else:
     avg_metrics[name] = value * weight
 
-return EvaluationResult(
-    metrics=avg_metrics,
-    artifacts=artifacts,
-)
+return avg_metrics
 
 except Exception as e:
     logger.warning(f"Error parsing LLM response: {str(e)}")
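With the revert, `_llm_evaluate` again keeps only numeric fields from the parsed LLM JSON and returns a plain metrics dict; non-numeric fields such as a `reasoning` string are dropped rather than forwarded as artifacts. A self-contained sketch of that filtering step follows; the sample response is illustrative, shaped like the prompt template in the removed README section, not taken from a real run.

```python
import json

# Illustrative LLM response body; field names mirror the removed README example.
raw = '{"readability": 0.8, "maintainability": 0.7, "efficiency": 0.9, "reasoning": "Clear naming, minor duplication."}'
result = json.loads(raw)

# Same filter as the restored code: keep only int/float values as metrics.
metrics = {
    name: float(value)
    for name, value in result.items()
    if isinstance(value, (int, float))
}

print(metrics)  # {'readability': 0.8, 'maintainability': 0.7, 'efficiency': 0.9}
# "reasoning" is discarded here; the reverted branch had returned it as an artifact.
```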
