Commit 3d783a2

Merge pull request #73 from jvm123/feat-prompt-export
Feature: Prompt & response saving
2 parents fa1ff18 + cce83e3

File tree: 15 files changed (+467, -37 lines)

Makefile

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ help:
 @echo " test - Run tests"
 @echo " docker-build - Build the Docker image"
 @echo " docker-run - Run the Docker container with the example"
+@echo " visualizer - Run the visualization script"
 
 .PHONY: all
 all: install test

README.md

Lines changed: 17 additions & 6 deletions
@@ -185,7 +185,7 @@ See the [Configuration Guide](configs/default_config.yaml) for a full list of op
 
 ## Artifacts Channel
 
-OpenEvolve includes a **artifacts side-channel** that allows evaluators to capture build errors, profiling results, etc. to provide better feedback to the LLM in subsequent generations. This feature enhances the evolution process by giving the LLM context about what went wrong and how to fix it.
+OpenEvolve includes an **artifacts side-channel** that allows evaluators to capture build errors, profiling results, etc. to provide better feedback to the LLM in subsequent generations. This feature enhances the evolution process by giving the LLM context about what went wrong and how to fix it.
 
 The artifacts channel operates alongside the traditional fitness metrics.
 
@@ -205,17 +205,28 @@ return EvaluationResult(
 ```
 
 The next generation prompt will include:
-```
+```markdown
 ## Last Execution Output
 ### Stderr
-```
 SyntaxError: invalid syntax (line 15)
-```
+
 ### Traceback
-```
 ...
 ```
+
+## Example: LLM Feedback
+
+An example for an LLM artifact side channel is part of the default evaluation template, which ends with
+```markdown
+Return your evaluation as a JSON object with the following format:
+{{
+    "readability": [score],
+    "maintainability": [score],
+    "efficiency": [score],
+    "reasoning": "[brief explanation of scores]"
+}}
 ```
+The non-float values, in this case the "reasoning" key of the json response that the evaluator LLM generates, will be available within the next generation prompt.
 
 ### Configuration
 
@@ -240,7 +251,7 @@ export ENABLE_ARTIFACTS=false
 ### Benefits
 
 - **Faster convergence** - LLMs can see what went wrong and fix it directly
-- **Better error handling** - Compilation and runtime failures become learning opportunities
+- **Better error handling** - Compilation and runtime failures become learning opportunities
 - **Rich debugging context** - Full stack traces and error messages guide improvements
 - **Zero overhead** - When disabled, no performance impact on evaluation
 

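For orientation, below is a minimal evaluator sketch that feeds the artifacts channel described in the README hunk above. It assumes the `EvaluationResult(metrics=..., artifacts=...)` constructor used later in this commit in `openevolve/evaluator.py`; the `evaluate(program_path)` entry point and the metric/artifact names are illustrative, not taken from this diff.

```python
# Illustrative sketch only: an evaluator returning artifacts next to metrics.
from openevolve.evaluation_result import EvaluationResult

def evaluate(program_path: str) -> EvaluationResult:  # hypothetical evaluator entry point
    source = open(program_path).read()
    try:
        compile(source, program_path, "exec")
        return EvaluationResult(metrics={"compiles": 1.0}, artifacts={})
    except SyntaxError as e:
        # Non-numeric feedback travels through the artifacts side-channel
        return EvaluationResult(
            metrics={"compiles": 0.0},
            artifacts={"stderr": f"SyntaxError: {e.msg} (line {e.lineno})"},
        )
```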
configs/default_config.yaml

Lines changed: 1 addition & 0 deletions
@@ -71,6 +71,7 @@ database:
   # General settings
   db_path: null # Path to persist database (null = in-memory only)
   in_memory: true # Keep database in memory for faster access
+  log_prompts: true # If true, log all prompts and responses into the database
 
   # Evolutionary parameters
   population_size: 1000 # Maximum number of programs to keep in memory

openevolve/config.py

Lines changed: 4 additions & 0 deletions
@@ -142,6 +142,9 @@ class DatabaseConfig:
     db_path: Optional[str] = None  # Path to store database on disk
     in_memory: bool = True
 
+    # Prompt and response logging to programs/<id>.json
+    log_prompts: bool = True
+
     # Evolutionary parameters
     population_size: int = 1000
     archive_size: int = 100
@@ -308,6 +311,7 @@ def to_dict(self) -> Dict[str, Any]:
                 "migration_interval": self.database.migration_interval,
                 "migration_rate": self.database.migration_rate,
                 "random_seed": self.database.random_seed,
+                "log_prompts": self.database.log_prompts,
             },
             "evaluator": {
                 "timeout": self.evaluator.timeout,

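A quick, hedged sketch of how the new flag can be toggled through the config objects changed above. It assumes `load_config()` accepts a YAML path (the loader itself is not part of this diff) and that `Config.to_dict()` is the method extended in the second hunk.

```python
# Sketch under the assumptions stated above; not part of this commit.
from openevolve.config import load_config

config = load_config("configs/default_config.yaml")
config.database.log_prompts = False  # turn prompt/response logging off programmatically
assert config.to_dict()["database"]["log_prompts"] is False
```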
openevolve/controller.py

Lines changed: 23 additions & 0 deletions
@@ -10,6 +10,7 @@
 import uuid
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union
+import traceback
 
 from openevolve.config import Config, load_config
 from openevolve.database import Program, ProgramDatabase
@@ -142,6 +143,7 @@ def __init__(
             evaluation_file,
             self.llm_evaluator_ensemble,
             self.evaluator_prompt_sampler,
+            database=self.database,
         )
 
         logger.info(f"Initialized OpenEvolve with {initial_program_path} " f"and {evaluation_file}")
@@ -335,10 +337,30 @@ async def run(
                 # Add to database (will be added to current island)
                 self.database.add(child_program, iteration=i + 1)
 
+                # Log prompts
+                self.database.log_prompt(
+                    template_key=(
+                        "full_rewrite_user" if self.config.allow_full_rewrites else "diff_user"
+                    ),
+                    program_id=child_id,
+                    prompt=prompt,
+                    responses=[llm_response],
+                )
+
                 # Store artifacts if they exist
                 if artifacts:
                     self.database.store_artifacts(child_id, artifacts)
 
+                # Log prompts
+                self.database.log_prompt(
+                    template_key=(
+                        "full_rewrite_user" if self.config.allow_full_rewrites else "diff_user"
+                    ),
+                    program_id=child_id,
+                    prompt=prompt,
+                    responses=[llm_response],
+                )
+
                 # Increment generation for current island
                 self.database.increment_island_generation()
 
@@ -384,6 +406,7 @@ async def run(
 
             except Exception as e:
                 logger.error(f"Error in iteration {i+1}: {str(e)}")
+                traceback.print_exc()
                 continue
 
         # Get the best program using our tracking mechanism

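For reference, a hedged, standalone sketch of the `log_prompt` call the controller now makes each iteration (the database method itself is added in `openevolve/database.py` below). The `child-1234` id and the prompt/response strings are placeholders, and it assumes `DatabaseConfig`'s defaults are enough to build an in-memory `ProgramDatabase`.

```python
# Placeholder values throughout; mirrors the controller's new log_prompt call.
from openevolve.config import DatabaseConfig
from openevolve.database import ProgramDatabase

db = ProgramDatabase(DatabaseConfig())  # log_prompts defaults to True
db.log_prompt(
    program_id="child-1234",           # hypothetical child program id
    template_key="diff_user",          # "full_rewrite_user" when full rewrites are allowed
    prompt={"system": "system prompt text", "user": "user prompt text"},
    responses=["raw LLM response text"],
)
# log_prompt attaches the responses list to the stored prompt entry,
# so it is persisted together with the prompt on the next database save.
```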
openevolve/database.py

Lines changed: 55 additions & 3 deletions
@@ -118,6 +118,9 @@ def __init__(self, config: DatabaseConfig):
         if config.db_path and os.path.exists(config.db_path):
             self.load(config.db_path)
 
+        # Prompt log
+        self.prompts_by_program: Dict[str, Dict[str, Dict[str, str]]] = None
+
         # Set random seed for reproducible sampling if specified
         if config.random_seed is not None:
             import random
@@ -328,7 +331,14 @@ def save(self, path: Optional[str] = None, iteration: int = 0) -> None:
 
         # Save each program
         for program in self.programs.values():
-            self._save_program(program, save_path)
+            prompts = None
+            if (
+                self.config.log_prompts
+                and self.prompts_by_program
+                and program.id in self.prompts_by_program
+            ):
+                prompts = self.prompts_by_program[program.id]
+            self._save_program(program, save_path, prompts=prompts)
 
         # Save metadata
         metadata = {
@@ -489,13 +499,19 @@ def _distribute_programs_to_islands(self) -> None:
 
         logger.info(f"Distributed {len(program_ids)} programs across {len(self.islands)} islands")
 
-    def _save_program(self, program: Program, base_path: Optional[str] = None) -> None:
+    def _save_program(
+        self,
+        program: Program,
+        base_path: Optional[str] = None,
+        prompts: Optional[Dict[str, Dict[str, str]]] = None,
+    ) -> None:
         """
         Save a program to disk
 
         Args:
             program: Program to save
             base_path: Base path to save to (uses config.db_path if None)
+            prompts: Optional prompts to save with the program, in the format {template_key: { 'system': str, 'user': str }}
         """
         save_path = base_path or self.config.db_path
         if not save_path:
@@ -506,9 +522,13 @@ def _save_program(self, program: Program, base_path: Optional[str] = None) -> None:
         os.makedirs(programs_dir, exist_ok=True)
 
         # Save program
+        program_dict = program.to_dict()
+        if prompts:
+            program_dict["prompts"] = prompts
         program_path = os.path.join(programs_dir, f"{program.id}.json")
+
         with open(program_path, "w") as f:
-            json.dump(program.to_dict(), f)
+            json.dump(program_dict, f)
 
     def _calculate_feature_coords(self, program: Program) -> List[int]:
         """
@@ -1288,3 +1308,35 @@ def _load_artifact_dir(self, artifact_dir: str) -> Dict[str, Union[str, bytes]]:
             logger.warning(f"Failed to list artifact directory {artifact_dir}: {e}")
 
         return artifacts
+
+    def log_prompt(
+        self,
+        program_id: str,
+        template_key: str,
+        prompt: Dict[str, str],
+        responses: Optional[List[str]] = None,
+    ) -> None:
+        """
+        Log a prompt for a program.
+        Only logs if self.config.log_prompts is True.
+
+        Args:
+            program_id: ID of the program to log the prompt for
+            template_key: Key for the prompt template
+            prompt: Prompts in the format {template_key: { 'system': str, 'user': str }}.
+            responses: Optional list of responses to the prompt, if available.
+        """
+
+        if not self.config.log_prompts:
+            return
+
+        if responses is None:
+            responses = []
+        prompt["responses"] = responses
+
+        if self.prompts_by_program is None:
+            self.prompts_by_program = {}
+
+        if program_id not in self.prompts_by_program:
+            self.prompts_by_program[program_id] = {}
+        self.prompts_by_program[program_id][template_key] = prompt

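To see what the saved records look like, a hedged inspection snippet follows. It assumes `db_path` was configured and `save()` has run; the `openevolve_output/db` path is purely hypothetical, while the `programs/<id>.json` layout and the `prompts` key follow `_save_program` above.

```python
# Hypothetical output directory; only the "prompts" structure is taken from this commit.
import glob
import json

for path in glob.glob("openevolve_output/db/programs/*.json"):
    with open(path) as f:
        record = json.load(f)
    for template_key, entry in record.get("prompts", {}).items():
        print(path, template_key)
        print("  system   :", entry["system"][:60])
        print("  user     :", entry["user"][:60])
        print("  responses:", len(entry.get("responses", [])))
```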
openevolve/evaluator.py

Lines changed: 56 additions & 12 deletions
@@ -18,7 +18,9 @@
 import traceback
 
 from openevolve.config import EvaluatorConfig
+from openevolve.database import ProgramDatabase
 from openevolve.evaluation_result import EvaluationResult
+from openevolve.database import ProgramDatabase
 from openevolve.llm.ensemble import LLMEnsemble
 from openevolve.utils.async_utils import TaskPool, run_in_executor
 from openevolve.prompt.sampler import PromptSampler
@@ -41,11 +43,13 @@ def __init__(
         evaluation_file: str,
         llm_ensemble: Optional[LLMEnsemble] = None,
         prompt_sampler: Optional[PromptSampler] = None,
+        database: Optional[ProgramDatabase] = None,
     ):
         self.config = config
         self.evaluation_file = evaluation_file
         self.llm_ensemble = llm_ensemble
         self.prompt_sampler = prompt_sampler
+        self.database = database
 
         # Create a task pool for parallel evaluation
         self.task_pool = TaskPool(max_concurrency=config.parallel_evaluations)
@@ -131,16 +135,40 @@ async def evaluate_program(
             eval_result = self._process_evaluation_result(result)
 
             # Add LLM feedback if configured
+            llm_eval_result = None
             if self.config.use_llm_feedback and self.llm_ensemble:
-                feedback_metrics = await self._llm_evaluate(program_code)
+                llm_result = await self._llm_evaluate(program_code, program_id=program_id)
+                llm_eval_result = self._process_evaluation_result(llm_result)
 
                 # Combine metrics
-                for name, value in feedback_metrics.items():
+                for name, value in llm_result.metrics.items():
                     eval_result.metrics[f"llm_{name}"] = value * self.config.llm_feedback_weight
 
             # Store artifacts if enabled and present
-            if artifacts_enabled and eval_result.has_artifacts() and program_id:
-                self._pending_artifacts[program_id] = eval_result.artifacts
+            if (
+                artifacts_enabled
+                and (
+                    eval_result.has_artifacts()
+                    or (llm_eval_result and llm_eval_result.has_artifacts())
+                )
+                and program_id
+            ):
+                self._pending_artifacts[program_id] = {}
+
+                # Merge eval_result artifacts with llm artifacts if they exist
+                if eval_result.has_artifacts():
+                    self._pending_artifacts[program_id].update(eval_result.artifacts)
+                    logger.debug(
+                        f"Program{program_id_str} returned artifacts: "
+                        f"{eval_result.artifacts}"
+                    )
+
+                if llm_eval_result and llm_eval_result.has_artifacts():
+                    self._pending_artifacts[program_id].update(llm_eval_result.artifacts)
+                    logger.debug(
+                        f"Program{program_id_str} returned LLM artifacts: "
+                        f"{llm_eval_result.artifacts}"
+                    )
 
             elapsed = time.time() - start_time
             logger.info(
@@ -156,6 +184,7 @@ async def evaluate_program(
             logger.warning(
                 f"Evaluation attempt {attempt + 1}/{self.config.max_retries + 1} failed for program{program_id_str}: {str(e)}"
             )
+            traceback.print_exc()
 
             # Capture failure artifacts if enabled
             if artifacts_enabled and program_id:
@@ -378,12 +407,13 @@ async def _cascade_evaluate(
             },
         )
 
-    async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
+    async def _llm_evaluate(self, program_code: str, program_id: str = "") -> Dict[str, float]:
        """
        Use LLM to evaluate code quality
 
        Args:
            program_code: Code to evaluate
+           program_id: Optional ID for logging
 
        Returns:
            Dictionary of metric name to score
@@ -402,12 +432,22 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
            prompt["system"], [{"role": "user", "content": prompt["user"]}]
        )
 
+       # Log prompt and response to database
+       if self.database and program_id:
+           self.database.log_prompt(
+               program_id=program_id,
+               template_key="evaluation",
+               prompt=prompt,
+               responses=responses,
+           )
+
        # Extract JSON from response
        try:
            # Try to find JSON block
            json_pattern = r"```json\n(.*?)\n```"
            import re
 
+           artifacts = {}
            avg_metrics = {}
            for i, response in enumerate(responses):
                json_match = re.search(json_pattern, response, re.DOTALL)
@@ -426,12 +466,13 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
                # Parse JSON
                result = json.loads(json_str)
 
-               # Filter all non-numeric values
-               metrics = {
-                   name: float(value)
-                   for name, value in result.items()
-                   if isinstance(value, (int, float))
-               }
+               # All non-numeric values are artifacts, all numeric values are metrics
+               metrics = {}
+               for key, value in result.items():
+                   if not isinstance(value, (int, float)):
+                       artifacts[key] = value
+                   else:
+                       metrics[key] = float(value)
 
                # Weight of the model in the ensemble
                weight = self.llm_ensemble.weights[i] if self.llm_ensemble.weights else 1.0
@@ -443,7 +484,10 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
                else:
                    avg_metrics[name] = value * weight
 
-           return avg_metrics
+           return EvaluationResult(
+               metrics=avg_metrics,
+               artifacts=artifacts,
+           )
 
        except Exception as e:
            logger.warning(f"Error parsing LLM response: {str(e)}")

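The core of the `_llm_evaluate` change is the metrics/artifacts split. A self-contained sketch of that logic, using a made-up parsed response, is below.

```python
# Stand-alone illustration of the split performed in _llm_evaluate:
# numeric JSON values become metrics, everything else becomes artifacts.
parsed = {
    "readability": 0.8,
    "maintainability": 0.7,
    "efficiency": 0.9,
    "reasoning": "Clear structure, but the inner loop re-allocates a buffer on every pass.",
}

metrics, artifacts = {}, {}
for key, value in parsed.items():
    if isinstance(value, (int, float)):
        metrics[key] = float(value)
    else:
        artifacts[key] = value

print(metrics)    # numeric scores, later weighted and averaged across the ensemble
print(artifacts)  # e.g. the "reasoning" text, surfaced in the next generation prompt
```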