
Commit 0490d6e

Prompt logging to database added
1 parent ef07099 commit 0490d6e


5 files changed, +109 -6 lines


configs/default_config.yaml

Lines changed: 1 addition & 0 deletions
@@ -71,6 +71,7 @@ database:
   # General settings
   db_path: null # Path to persist database (null = in-memory only)
   in_memory: true # Keep database in memory for faster access
+  log_prompts: true # If true, log all prompts and responses into the database
 
   # Evolutionary parameters
   population_size: 1000 # Maximum number of programs to keep in memory
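
The flag defaults to true, so prompt and response logging is on unless a config overrides it. A minimal sketch of reading and toggling it programmatically (load_config's exact signature is an assumption based on the controller import below; the attribute path follows the DatabaseConfig change in the next file):

from openevolve.config import load_config

# Sketch only: load_config is assumed to accept a path to a YAML config file.
config = load_config("configs/default_config.yaml")
print(config.database.log_prompts)   # True by default after this commit
config.database.log_prompts = False  # opt out of prompt/response logging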

openevolve/config.py

Lines changed: 4 additions & 0 deletions
@@ -142,6 +142,9 @@ class DatabaseConfig:
     db_path: Optional[str] = None # Path to store database on disk
     in_memory: bool = True
 
+    # Prompt and response logging to programs/<id>.json
+    log_prompts: bool = True
+
     # Evolutionary parameters
     population_size: int = 1000
     archive_size: int = 100
@@ -308,6 +311,7 @@ def to_dict(self) -> Dict[str, Any]:
                 "migration_interval": self.database.migration_interval,
                 "migration_rate": self.database.migration_rate,
                 "random_seed": self.database.random_seed,
+                "log_prompts": self.database.log_prompts,
             },
             "evaluator": {
                 "timeout": self.evaluator.timeout,

openevolve/controller.py

Lines changed: 24 additions & 1 deletion
@@ -9,7 +9,8 @@
 import time
 import uuid
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Union
+import traceback
 
 from openevolve.config import Config, load_config
 from openevolve.database import Program, ProgramDatabase
@@ -114,6 +115,7 @@ def __init__(
             evaluation_file,
             self.llm_evaluator_ensemble,
             self.evaluator_prompt_sampler,
+            database=self.database,
         )
 
         logger.info(f"Initialized OpenEvolve with {initial_program_path} " f"and {evaluation_file}")
@@ -307,10 +309,30 @@ async def run(
                 # Add to database (will be added to current island)
                 self.database.add(child_program, iteration=i + 1)
 
+                # Log prompts
+                self.database.log_prompt(
+                    template_key=(
+                        "full_rewrite_user" if self.config.allow_full_rewrites else "diff_user"
+                    ),
+                    program_id=child_id,
+                    prompt=prompt,
+                    responses=[llm_response],
+                )
+
                 # Store artifacts if they exist
                 if artifacts:
                     self.database.store_artifacts(child_id, artifacts)
 
+                # Log prompts
+                self.database.log_prompt(
+                    template_key=(
+                        "full_rewrite_user" if self.config.allow_full_rewrites else "diff_user"
+                    ),
+                    program_id=child_id,
+                    prompt=prompt,
+                    responses=[llm_response],
+                )
+
                 # Increment generation for current island
                 self.database.increment_island_generation()
 
@@ -347,6 +369,7 @@ async def run(
 
             except Exception as e:
                 logger.error(f"Error in iteration {i+1}: {str(e)}")
+                traceback.print_exc()
                 continue
 
         # Get the best program using our tracking mechanism
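
For reference, the prompt logged here is the plain {"system": ..., "user": ...} mapping built by the prompt sampler; a minimal sketch of the call site (system_message, user_message, and llm_response are illustrative names, not taken from this diff):

# Sketch only: the prompt dict shape follows log_prompt's usage in database.py below.
prompt = {"system": system_message, "user": user_message}
self.database.log_prompt(
    template_key="full_rewrite_user" if self.config.allow_full_rewrites else "diff_user",
    program_id=child_id,
    prompt=prompt,
    responses=[llm_response],
)

Note that log_prompt attaches the responses list to the same dict (prompt["responses"] = responses), so the stored entry carries both the prompt and the model output.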

openevolve/database.py

Lines changed: 55 additions & 3 deletions
@@ -104,6 +104,9 @@ def __init__(self, config: DatabaseConfig):
         if config.db_path and os.path.exists(config.db_path):
             self.load(config.db_path)
 
+        # Prompt log
+        self.prompts_by_program: Dict[str, Dict[str, Dict[str, str]]] = None
+
         # Set random seed for reproducible sampling if specified
         if config.random_seed is not None:
             import random
@@ -314,7 +317,14 @@ def save(self, path: Optional[str] = None, iteration: int = 0) -> None:
 
         # Save each program
         for program in self.programs.values():
-            self._save_program(program, save_path)
+            prompts = None
+            if (
+                self.config.log_prompts
+                and self.prompts_by_program
+                and program.id in self.prompts_by_program
+            ):
+                prompts = self.prompts_by_program[program.id]
+            self._save_program(program, save_path, prompts=prompts)
 
         # Save metadata
         metadata = {
@@ -382,13 +392,19 @@ def load(self, path: str) -> None:
 
         logger.info(f"Loaded database with {len(self.programs)} programs from {path}")
 
-    def _save_program(self, program: Program, base_path: Optional[str] = None) -> None:
+    def _save_program(
+        self,
+        program: Program,
+        base_path: Optional[str] = None,
+        prompts: Optional[Dict[str, Dict[str, str]]] = None,
+    ) -> None:
         """
         Save a program to disk
 
         Args:
             program: Program to save
             base_path: Base path to save to (uses config.db_path if None)
+            prompts: Optional prompts to save with the program, in the format {template_key: { 'system': str, 'user': str }}
         """
         save_path = base_path or self.config.db_path
         if not save_path:
@@ -399,9 +415,13 @@ def _save_program(self, program: Program, base_path: Optional[str] = None) -> No
         os.makedirs(programs_dir, exist_ok=True)
 
         # Save program
+        program_dict = program.to_dict()
+        if prompts:
+            program_dict["prompts"] = prompts
         program_path = os.path.join(programs_dir, f"{program.id}.json")
+
         with open(program_path, "w") as f:
-            json.dump(program.to_dict(), f)
+            json.dump(program_dict, f)
 
     def _calculate_feature_coords(self, program: Program) -> List[int]:
         """
@@ -1079,3 +1099,35 @@ def _load_artifact_dir(self, artifact_dir: str) -> Dict[str, Union[str, bytes]]:
             logger.warning(f"Failed to list artifact directory {artifact_dir}: {e}")
 
         return artifacts
+
+    def log_prompt(
+        self,
+        program_id: str,
+        template_key: str,
+        prompt: Dict[str, str],
+        responses: Optional[List[str]] = None,
+    ) -> None:
+        """
+        Log a prompt for a program.
+        Only logs if self.config.log_prompts is True.
+
+        Args:
+            program_id: ID of the program to log the prompt for
+            template_key: Key for the prompt template
+            prompt: Prompts in the format {template_key: { 'system': str, 'user': str }}.
+            responses: Optional list of responses to the prompt, if available.
+        """
+
+        if not self.config.log_prompts:
+            return
+
+        if responses is None:
+            responses = []
+        prompt["responses"] = responses
+
+        if self.prompts_by_program is None:
+            self.prompts_by_program = {}
+
+        if program_id not in self.prompts_by_program:
+            self.prompts_by_program[program_id] = {}
+        self.prompts_by_program[program_id][template_key] = prompt
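
Putting _save_program and log_prompt together, a saved programs/<id>.json record gains a "prompts" section; an illustrative sketch of its shape (the template keys and the "responses" field follow the code above, while the surrounding program fields and all values are placeholders):

# Illustrative only: the real program fields come from Program.to_dict().
program_record = {
    "id": "8c1f...",  # placeholder program id
    "code": "...",    # usual Program fields
    "prompts": {
        "diff_user": {
            "system": "...system prompt...",
            "user": "...user prompt...",
            "responses": ["...raw LLM response..."],
        },
        "evaluation": {
            "system": "...evaluator system prompt...",
            "user": "...evaluator user prompt...",
            "responses": ["...LLM feedback..."],
        },
    },
}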

openevolve/evaluator.py

Lines changed: 25 additions & 2 deletions
@@ -18,7 +18,9 @@
 import traceback
 
 from openevolve.config import EvaluatorConfig
+from openevolve.database import ProgramDatabase
 from openevolve.evaluation_result import EvaluationResult
+from openevolve.database import ProgramDatabase
 from openevolve.llm.ensemble import LLMEnsemble
 from openevolve.utils.async_utils import TaskPool, run_in_executor
 from openevolve.prompt.sampler import PromptSampler
@@ -41,11 +43,13 @@ def __init__(
         evaluation_file: str,
         llm_ensemble: Optional[LLMEnsemble] = None,
         prompt_sampler: Optional[PromptSampler] = None,
+        database: Optional[ProgramDatabase] = None,
     ):
         self.config = config
         self.evaluation_file = evaluation_file
         self.llm_ensemble = llm_ensemble
         self.prompt_sampler = prompt_sampler
+        self.database = database
 
         # Create a task pool for parallel evaluation
         self.task_pool = TaskPool(max_concurrency=config.parallel_evaluations)
@@ -127,7 +131,7 @@ async def evaluate_program(
                 # Add LLM feedback if configured
                 llm_eval_result = None
                 if self.config.use_llm_feedback and self.llm_ensemble:
-                    llm_result = await self._llm_evaluate(program_code)
+                    llm_result = await self._llm_evaluate(program_code, program_id=program_id)
                     llm_eval_result = self._process_evaluation_result(llm_result)
 
                 # Combine metrics
@@ -148,9 +152,17 @@ async def evaluate_program(
                 # Merge eval_result artifacts with llm artifacts if they exist
                 if eval_result.has_artifacts():
                     self._pending_artifacts[program_id].update(eval_result.artifacts)
+                    logger.debug(
+                        f"Program{program_id_str} returned artifacts: "
+                        f"{eval_result.artifacts}"
+                    )
 
                 if llm_eval_result and llm_eval_result.has_artifacts():
                     self._pending_artifacts[program_id].update(llm_eval_result.artifacts)
+                    logger.debug(
+                        f"Program{program_id_str} returned LLM artifacts: "
+                        f"{llm_eval_result.artifacts}"
+                    )
 
                 elapsed = time.time() - start_time
                 logger.info(
@@ -166,6 +178,7 @@ async def evaluate_program(
                 logger.warning(
                     f"Evaluation attempt {attempt + 1}/{self.config.max_retries + 1} failed for program{program_id_str}: {str(e)}"
                 )
+                traceback.print_exc()
 
                 # Capture failure artifacts if enabled
                 if artifacts_enabled and program_id:
@@ -382,12 +395,13 @@ async def _cascade_evaluate(
             },
         )
 
-    async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
+    async def _llm_evaluate(self, program_code: str, program_id: str = "") -> Dict[str, float]:
        """
         Use LLM to evaluate code quality
 
         Args:
             program_code: Code to evaluate
+            program_id: Optional ID for logging
 
         Returns:
             Dictionary of metric name to score
@@ -406,6 +420,15 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
             prompt["system"], [{"role": "user", "content": prompt["user"]}]
         )
 
+        # Log prompt and response to database
+        if self.database and program_id:
+            self.database.log_prompt(
+                program_id=program_id,
+                template_key="evaluation",
+                prompt=prompt,
+                responses=responses,
+            )
+
         # Extract JSON from response
         try:
             # Try to find JSON block
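
With database.db_path set so the database is persisted to disk, the logged prompts can be read back from the per-program JSON files; a minimal sketch (the programs/ layout follows _save_program above, while the concrete db_path value is illustrative):

import glob
import json
import os

db_path = "openevolve_output/db"  # illustrative; use whatever database.db_path points to
for path in glob.glob(os.path.join(db_path, "programs", "*.json")):
    with open(path) as f:
        record = json.load(f)
    for template_key, entry in record.get("prompts", {}).items():
        print(path, template_key, "responses:", len(entry.get("responses", [])))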

0 commit comments
