Commit d1d1a57

Merge pull request #68 from jvm123/feat-llm-artifact-side-channel
Feature: Artifact side channel responses for LLM feedback
2 parents: 59872a7 + 0fe928f · commit d1d1a57

2 files changed: +50 −17 lines changed

README.md

Lines changed: 17 additions & 6 deletions
````diff
@@ -185,7 +185,7 @@ See the [Configuration Guide](configs/default_config.yaml) for a full list of op
 
 ## Artifacts Channel
 
-OpenEvolve includes a **artifacts side-channel** that allows evaluators to capture build errors, profiling results, etc. to provide better feedback to the LLM in subsequent generations. This feature enhances the evolution process by giving the LLM context about what went wrong and how to fix it.
+OpenEvolve includes an **artifacts side-channel** that allows evaluators to capture build errors, profiling results, etc. to provide better feedback to the LLM in subsequent generations. This feature enhances the evolution process by giving the LLM context about what went wrong and how to fix it.
 
 The artifacts channel operates alongside the traditional fitness metrics.
 
@@ -205,17 +205,28 @@ return EvaluationResult(
 ```
 
 The next generation prompt will include:
-```
+```markdown
 ## Last Execution Output
 ### Stderr
-```
 SyntaxError: invalid syntax (line 15)
-```
+
 ### Traceback
-```
 ...
 ```
+
+## Example: LLM Feedback
+
+An example for an LLM artifact side channel is part of the default evaluation prompt template, which ends with
+```markdown
+Return your evaluation as a JSON object with the following format:
+{{
+"readability": [score],
+"maintainability": [score],
+"efficiency": [score],
+"reasoning": "[brief explanation of scores]"
+}}
 ```
+The non-float values, in this case the "reasoning" key of the json response that the evaluator LLM generates, will be available within the next generation prompt.
 
 ### Configuration
 
@@ -240,7 +251,7 @@ export ENABLE_ARTIFACTS=false
 ### Benefits
 
 - **Faster convergence** - LLMs can see what went wrong and fix it directly
-- **Better error handling** - Compilation and runtime failures become learning opportunities
+- **Better error handling** - Compilation and runtime failures become learning opportunities
 - **Rich debugging context** - Full stack traces and error messages guide improvements
 - **Zero overhead** - When disabled, no performance impact on evaluation
 
````
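
For orientation, here is a minimal evaluator sketch that exercises the artifacts side-channel described in the README change above. The `evaluate(program_path)` signature and the import path for `EvaluationResult` are assumptions for illustration, not taken from this diff; only the `metrics`/`artifacts` fields mirror the commit.

```python
# Hypothetical evaluator sketch (import path and function signature assumed):
# numeric metrics drive selection, while free-form artifacts such as stderr
# are fed back to the LLM in the next generation prompt.
import subprocess

from openevolve.evaluation_result import EvaluationResult  # assumed module path


def evaluate(program_path: str) -> EvaluationResult:
    # Run the candidate program and capture its output
    proc = subprocess.run(
        ["python", program_path], capture_output=True, text=True, timeout=30
    )
    if proc.returncode != 0:
        # Failed run: score it zero, but pass the error text along as artifacts
        return EvaluationResult(
            metrics={"score": 0.0},
            artifacts={"stderr": proc.stderr},
        )
    return EvaluationResult(metrics={"score": 1.0}, artifacts={})
```
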
openevolve/evaluator.py

Lines changed: 33 additions & 11 deletions
````diff
@@ -131,16 +131,32 @@ async def evaluate_program(
 eval_result = self._process_evaluation_result(result)
 
 # Add LLM feedback if configured
+llm_eval_result = None
 if self.config.use_llm_feedback and self.llm_ensemble:
-    feedback_metrics = await self._llm_evaluate(program_code)
+    llm_result = await self._llm_evaluate(program_code)
+    llm_eval_result = self._process_evaluation_result(llm_result)
 
     # Combine metrics
-    for name, value in feedback_metrics.items():
+    for name, value in llm_result.metrics.items():
         eval_result.metrics[f"llm_{name}"] = value * self.config.llm_feedback_weight
 
 # Store artifacts if enabled and present
-if artifacts_enabled and eval_result.has_artifacts() and program_id:
-    self._pending_artifacts[program_id] = eval_result.artifacts
+if (
+    artifacts_enabled
+    and (
+        eval_result.has_artifacts()
+        or (llm_eval_result and llm_eval_result.has_artifacts())
+    )
+    and program_id
+):
+    self._pending_artifacts[program_id] = {}
+
+    # Merge eval_result artifacts with llm artifacts if they exist
+    if eval_result.has_artifacts():
+        self._pending_artifacts[program_id].update(eval_result.artifacts)
+
+    if llm_eval_result and llm_eval_result.has_artifacts():
+        self._pending_artifacts[program_id].update(llm_eval_result.artifacts)
 
 elapsed = time.time() - start_time
 logger.info(
@@ -156,6 +172,7 @@ async def evaluate_program(
 logger.warning(
     f"Evaluation attempt {attempt + 1}/{self.config.max_retries + 1} failed for program{program_id_str}: {str(e)}"
 )
+traceback.print_exc()
 
 # Capture failure artifacts if enabled
 if artifacts_enabled and program_id:
@@ -408,6 +425,7 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
 json_pattern = r"```json\n(.*?)\n```"
 import re
 
+artifacts = {}
 avg_metrics = {}
 for i, response in enumerate(responses):
     json_match = re.search(json_pattern, response, re.DOTALL)
@@ -426,12 +444,13 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
 # Parse JSON
 result = json.loads(json_str)
 
-# Filter all non-numeric values
-metrics = {
-    name: float(value)
-    for name, value in result.items()
-    if isinstance(value, (int, float))
-}
+# All non-numeric values are artifacts, all numeric values are metrics
+metrics = {}
+for key, value in result.items():
+    if not isinstance(value, (int, float)):
+        artifacts[key] = value
+    else:
+        metrics[key] = float(value)
 
 # Weight of the model in the ensemble
 weight = self.llm_ensemble.weights[i] if self.llm_ensemble.weights else 1.0
@@ -443,7 +462,10 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
 else:
     avg_metrics[name] = value * weight
 
-return avg_metrics
+return EvaluationResult(
+    metrics=avg_metrics,
+    artifacts=artifacts,
+)
 
 except Exception as e:
     logger.warning(f"Error parsing LLM response: {str(e)}")
````
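
As a quick standalone illustration of the parsing change in `_llm_evaluate` above, the sketch below applies the same numeric/non-numeric split to a parsed LLM response. The sample JSON is made up for the example; only the split logic mirrors the commit.

```python
# Standalone sketch of the metric/artifact split introduced above: numeric
# values become metrics, everything else (e.g. "reasoning") becomes an artifact.
import json

sample_response = """
{
    "readability": 0.8,
    "maintainability": 0.7,
    "efficiency": 0.9,
    "reasoning": "Clear structure, but the inner loop reallocates on every pass."
}
"""

result = json.loads(sample_response)

metrics = {}
artifacts = {}
for key, value in result.items():
    if isinstance(value, (int, float)):
        metrics[key] = float(value)
    else:
        artifacts[key] = value

print(metrics)    # {'readability': 0.8, 'maintainability': 0.7, 'efficiency': 0.9}
print(artifacts)  # {'reasoning': 'Clear structure, but the inner loop reallocates on every pass.'}
```
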
