From f84be60af0c60ce0a0ec4cc883c1f697baa7dec5 Mon Sep 17 00:00:00 2001
From: Julian
Date: Mon, 2 Jun 2025 23:57:58 +0200
Subject: [PATCH 1/2] Better support for LLM feedback and handling of LLM ensembles.

- config.py supports configuration of n-model ensembles for evolution and, optionally, a separate ensemble for evaluation; the yaml format stays backwards compatible; settings can be made for all models in llm: or for a specific model in llm: models:; new evaluator_system_message setting (see the config sketch at the end of this message)
- ensemble.py supports n-model ensembles
- OpenAILLM supports individual parameter configuration per model
- ensemble.py has a new generate_all_with_context() function
- evaluator.py uses the prompt sampler to generate LLM feedback prompts
- templates.py contains default prompts for LLM feedback

With the function_minimization example, set use_llm_feedback: true in its config.yaml. The LLM feedback will then provide output such as

```
{
  "readability": 0.92,
  "maintainability": 0.88,
  "efficiency": 0.82,
  "reasoning": "The code is quite readable, with clear function and variable names, concise comments, and a docstring explaining the purpose and arguments of the main search function. There is some minor room for improvement, such as splitting up large inner loops or extracting repeated logic, but overall it is easy to follow. Maintainability is high due to modularization and descriptive naming, but could be slightly improved by reducing the nesting level and possibly moving the annealing routine to its own top-level function. Efficiency is good for a simple global optimization approach; vectorized numpy operations are used where appropriate, and the population-based simulated annealing is a reasonable trade-off between exploration and exploitation. However, the algorithm could be further optimized (e.g., by fully vectorizing more of the walker updates or parallelizing restarts), and the approach is not the most efficient for high-dimensional or more complex landscapes."
}
```

The evolution can then consider the additional values:

```
Evolution complete!
Best program metrics:
  runs_successfully: 1.0000
  value_score: 0.9997
  distance_score: 0.9991
  overall_score: 0.9905
  standard_deviation_score: 0.9992
  speed_score: 0.0610
  reliability_score: 1.0000
  combined_score: 0.9525
  success_rate: 1.0000
  llm_readability: 0.0904
  llm_maintainability: 0.0816
  llm_efficiency: 0.0764
```

Note: I did not evaluate the results yet.
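For reference, here is a minimal sketch of a config.yaml that exercises the new options. The models / evaluator_models lists and evaluator_system_message come straight from this patch; the model names and weights are illustrative, and the placement of use_llm_feedback under the evaluator section is an assumption based on the existing example configs, not something this patch changes.

```yaml
# Minimal sketch (illustrative values, not new defaults)
llm:
  # Shared settings; these only fill in fields a model entry does not set itself
  api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
  temperature: 0.7

  # Ensemble used for evolution (any number of entries)
  models:
    - name: "gemini-2.0-flash-lite"
      weight: 0.8
    - name: "gemini-2.0-flash"
      weight: 0.2

  # Optional separate ensemble for LLM feedback; falls back to `models` if omitted
  evaluator_models:
    - name: "gemini-2.0-flash"
      weight: 1.0

prompt:
  evaluator_system_message: "You are an expert code reviewer."

evaluator:
  use_llm_feedback: true  # assumed to live under the evaluator section
```

Per-model overrides (for example a different temperature or api_base on one list entry) take precedence, because shared llm: values are only copied into fields that are still unset.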
---
 configs/default_config.yaml    |  23 ++++--
 openevolve/config.py           | 126 +++++++++++++++++++++++++++------
 openevolve/controller.py       |  14 +++-
 openevolve/evaluator.py        |  93 ++++++++++++------------
 openevolve/llm/ensemble.py     |  46 ++++++------
 openevolve/llm/openai.py       |  43 +++++------
 openevolve/prompt/sampler.py   |  13 ++--
 openevolve/prompt/templates.py |  27 +++++++
 tests/test_valid_configs.py    |  34 +++++++++
 9 files changed, 295 insertions(+), 124 deletions(-)
 create mode 100644 tests/test_valid_configs.py

diff --git a/configs/default_config.yaml b/configs/default_config.yaml
index 4bc7558aa..22f086b81 100644
--- a/configs/default_config.yaml
+++ b/configs/default_config.yaml
@@ -16,13 +16,21 @@ max_code_length: 10000  # Maximum allowed code length in character
 
 # LLM configuration
 llm:
-  # Primary model (used most frequently)
-  primary_model: "gemini-2.0-flash-lite"
-  primary_model_weight: 0.8  # Sampling weight for primary model
-
-  # Secondary model (used for occasional high-quality generations)
-  secondary_model: "gemini-2.0-flash"
-  secondary_model_weight: 0.2  # Sampling weight for secondary model
+  # Models for evolution
+  models:
+    # List of available models with their weights
+    - name: "gemini-2.0-flash-lite"
+      weight: 0.8
+    - name: "gemini-2.0-flash"
+      weight: 0.2
+
+  # Models for LLM feedback
+  evaluator_models:
+    # List of available models with their weights
+    - name: "gemini-2.0-flash-lite"
+      weight: 0.8
+    - name: "gemini-2.0-flash"
+      weight: 0.2
 
   # API configuration
   api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"  # Base URL for API (change for non-OpenAI models)
@@ -42,6 +50,7 @@ llm:
 prompt:
   template_dir: null  # Custom directory for prompt templates
   system_message: "You are an expert coder helping to improve programs through evolution."
+  evaluator_system_message: "You are an expert code reviewer."
 
   # Number of examples to include in the prompt
   num_top_programs: 3  # Number of top-performing programs to include

diff --git a/openevolve/config.py b/openevolve/config.py
index 460907ba4..c742ef945 100644
--- a/openevolve/config.py
+++ b/openevolve/config.py
@@ -11,22 +11,41 @@
 
 
 @dataclass
-class LLMConfig:
-    """Configuration for LLM models"""
+class LLMModelConfig:
+    """Configuration for a single LLM model"""
 
-    # Primary model
-    primary_model: str = "gemini-2.0-flash-lite"
-    primary_model_weight: float = 0.8
+    # API configuration
+    api_base: str = None
+    api_key: Optional[str] = None
+    name: str = None
 
-    # Secondary model
-    secondary_model: str = "gemini-2.0-flash"
-    secondary_model_weight: float = 0.2
+    # Weight for model in ensemble
+    weight: float = 1.0
+
+    # Generation parameters
+    system_message: Optional[str] = None
+    temperature: float = None
+    top_p: float = None
+    max_tokens: int = None
+
+    # Request parameters
+    timeout: int = None
+    retries: int = None
+    retry_delay: int = None
+
+
+@dataclass
+class LLMConfig(LLMModelConfig):
+    """Configuration for LLM models"""
 
     # API configuration
     api_base: str = "https://api.openai.com/v1"
-    api_key: Optional[str] = None
+    name: str = "gpt-4o"
 
     # Generation parameters
+    system_message: Optional[str] = (
+        "You are an expert coder helping to improve programs through evolution."
+    )
     temperature: float = 0.7
     top_p: float = 0.95
     max_tokens: int = 4096
@@ -36,6 +55,61 @@ class LLMConfig:
     retries: int = 3
     retry_delay: int = 5
 
+    # n-model configuration for evolution LLM ensemble
+    models: List[LLMModelConfig] = field(default_factory=lambda: [LLMModelConfig()])
+
+    # n-model configuration for evaluator LLM ensemble
+    evaluator_models: List[LLMModelConfig] = field(default_factory=lambda: [])
+
+    # Backwards compatibility with primary_model(_weight) options
+    primary_model: str = "gemini-2.0-flash-lite"
+    primary_model_weight: float = 0.8
+    secondary_model: str = "gemini-2.0-flash"
+    secondary_model_weight: float = 0.2
+
+    def __post_init__(self):
+        """Post-initialization to set up model configurations"""
+        # Handle backward compatibility for primary_model(_weight) and secondary_model(_weight).
+        if (self.primary_model or self.primary_model_weight) and len(self.models) < 1:
+            # Ensure we have a primary model
+            self.models.append(LLMModelConfig())
+            if self.primary_model:
+                self.models[0].name = self.primary_model
+            if self.primary_model_weight:
+                self.models[0].weight = self.primary_model_weight
+
+        if (self.secondary_model or self.secondary_model_weight) and len(self.models) < 2:
+            # Ensure we have a second model
+            self.models.append(LLMModelConfig())
+            if self.secondary_model:
+                self.models[1].name = self.secondary_model
+            if self.secondary_model_weight:
+                self.models[1].weight = self.secondary_model_weight
+
+        # If no evaluator models are defined, use the same models as for evolution
+        if not self.evaluator_models or len(self.evaluator_models) < 1:
+            self.evaluator_models = self.models.copy()
+
+        # Update models with shared configuration values
+        shared_config = {
+            "api_base": self.api_base,
+            "api_key": self.api_key,
+            "temperature": self.temperature,
+            "top_p": self.top_p,
+            "max_tokens": self.max_tokens,
+            "timeout": self.timeout,
+            "retries": self.retries,
+            "retry_delay": self.retry_delay,
+        }
+        self.update_model_params(shared_config)
+
+    def update_model_params(self, args: Dict[str, Any], overwrite: bool = False) -> None:
+        """Update model parameters for all models"""
+        for model in self.models + self.evaluator_models:
+            for key, value in args.items():
+                if overwrite or getattr(model, key, None) is None:
+                    setattr(model, key, value)
+
 
 @dataclass
 class PromptConfig:
@@ -43,6 +117,7 @@ class PromptConfig:
 
     template_dir: Optional[str] = None
     system_message: str = "You are an expert coder helping to improve programs through evolution."
+    evaluator_system_message: str = """You are an expert code reviewer."""
 
     # Number of examples to include in the prompt
     num_top_programs: int = 3
@@ -155,7 +230,12 @@ def from_dict(cls, config_dict: Dict[str, Any]) -> "Config":
 
         # Update nested configs
         if "llm" in config_dict:
-            config.llm = LLMConfig(**config_dict["llm"])
+            llm_dict = config_dict["llm"]
+            if "models" in llm_dict:
+                llm_dict["models"] = [LLMModelConfig(**m) for m in llm_dict["models"]]
+            if "evaluator_models" in llm_dict:
+                llm_dict["evaluator_models"] = [LLMModelConfig(**m) for m in llm_dict["evaluator_models"]]
+            config.llm = LLMConfig(**llm_dict)
         if "prompt" in config_dict:
             config.prompt = PromptConfig(**config_dict["prompt"])
         if "database" in config_dict:
@@ -176,10 +256,8 @@ def to_dict(self) -> Dict[str, Any]:
             "random_seed": self.random_seed,
             # Component configurations
             "llm": {
-                "primary_model": self.llm.primary_model,
-                "primary_model_weight": self.llm.primary_model_weight,
-                "secondary_model": self.llm.secondary_model,
-                "secondary_model_weight": self.llm.secondary_model_weight,
+                "models": self.llm.models,
+                "evaluator_models": self.llm.evaluator_models,
                 "api_base": self.llm.api_base,
                 "temperature": self.llm.temperature,
                 "top_p": self.llm.top_p,
@@ -191,6 +269,7 @@ def to_dict(self) -> Dict[str, Any]:
             "prompt": {
                 "template_dir": self.prompt.template_dir,
                 "system_message": self.prompt.system_message,
+                "evaluator_system_message": self.prompt.evaluator_system_message,
                 "num_top_programs": self.prompt.num_top_programs,
                 "num_diverse_programs": self.prompt.num_diverse_programs,
                 "use_template_stochasticity": self.prompt.use_template_stochasticity,
@@ -245,16 +324,17 @@ def to_yaml(self, path: Union[str, Path]) -> None:
 def load_config(config_path: Optional[Union[str, Path]] = None) -> Config:
     """Load configuration from a YAML file or use defaults"""
     if config_path and os.path.exists(config_path):
-        return Config.from_yaml(config_path)
+        config = Config.from_yaml(config_path)
+    else:
+        config = Config()
+
+    # Use environment variables if available
+    api_key = os.environ.get("OPENAI_API_KEY")
+    api_base = os.environ.get("OPENAI_API_BASE", "https://api.openai.com/v1")
 
-    # Use environment variables if available
-    api_key = os.environ.get("OPENAI_API_KEY")
-    api_base = os.environ.get("OPENAI_API_BASE", "https://api.openai.com/v1")
+    config.llm.update_model_params({"api_key": api_key, "api_base": api_base})
 
-    config = Config()
-    if api_key:
-        config.llm.api_key = api_key
-    if api_base:
-        config.llm.api_base = api_base
+    # Make the system message available to the individual models, in case it is not provided by the prompt sampler
+    config.llm.update_model_params({"system_message": config.prompt.system_message})
 
     return config

diff --git a/openevolve/controller.py b/openevolve/controller.py
index 68d3e0c12..85eda97fc 100644
--- a/openevolve/controller.py
+++ b/openevolve/controller.py
@@ -92,15 +92,25 @@ def __init__(
             self.file_extension = f".{self.file_extension}"
 
         # Initialize components
-        self.llm_ensemble = LLMEnsemble(self.config.llm)
+        self.llm_ensemble = LLMEnsemble(self.config.llm.models)
+        self.llm_evaluator_ensemble = LLMEnsemble(self.config.llm.evaluator_models)
+
         self.prompt_sampler = PromptSampler(self.config.prompt)
+        self.evaluator_prompt_sampler = PromptSampler(self.config.prompt)
+        self.evaluator_prompt_sampler.set_templates("evaluator_system_message")
 
         # Pass random seed to database if specified
        if self.config.random_seed is not None:
             self.config.database.random_seed = self.config.random_seed
 
         self.database = ProgramDatabase(self.config.database)
-        self.evaluator = Evaluator(self.config.evaluator, evaluation_file, self.llm_ensemble)
+
+        self.evaluator = Evaluator(
+            self.config.evaluator,
+            evaluation_file,
+            self.llm_evaluator_ensemble,
+            self.evaluator_prompt_sampler,
+        )
 
         logger.info(f"Initialized OpenEvolve with {initial_program_path} " f"and {evaluation_file}")

diff --git a/openevolve/evaluator.py b/openevolve/evaluator.py
index d7f4ed654..7c093667d 100644
--- a/openevolve/evaluator.py
+++ b/openevolve/evaluator.py
@@ -14,10 +14,12 @@
 import uuid
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+import traceback
 
 from openevolve.config import EvaluatorConfig
 from openevolve.llm.ensemble import LLMEnsemble
 from openevolve.utils.async_utils import TaskPool, run_in_executor
+from openevolve.prompt.sampler import PromptSampler
 
 logger = logging.getLogger(__name__)
 
@@ -35,10 +37,12 @@ def __init__(
         config: EvaluatorConfig,
         evaluation_file: str,
         llm_ensemble: Optional[LLMEnsemble] = None,
+        prompt_sampler: Optional[PromptSampler] = None,
     ):
         self.config = config
         self.evaluation_file = evaluation_file
         self.llm_ensemble = llm_ensemble
+        self.prompt_sampler = prompt_sampler
 
         # Create a task pool for parallel evaluation
         self.task_pool = TaskPool(max_concurrency=config.parallel_evaluations)
@@ -285,30 +289,14 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
 
         try:
             # Create prompt for LLM
-            prompt = f"""
-            Evaluate the following code on a scale of 0.0 to 1.0 for the following metrics:
-            1. Readability: How easy is the code to read and understand?
-            2. Maintainability: How easy would the code be to maintain and modify?
-            3. Efficiency: How efficient is the code in terms of time and space complexity?
-
-            For each metric, provide a score between 0.0 and 1.0, where 1.0 is best.
-
-            Code to evaluate:
-            ```python
-            {program_code}
-            ```
-
-            Return your evaluation as a JSON object with the following format:
-            {{
-                "readability": [score],
-                "maintainability": [score],
-                "efficiency": [score],
-                "reasoning": "[brief explanation of scores]"
-            }}
-            """
+            prompt = self.prompt_sampler.build_prompt(
+                current_program=program_code, template_key="evaluation"
+            )
 
             # Get LLM response
-            response = await self.llm_ensemble.generate(prompt)
+            responses = await self.llm_ensemble.generate_all_with_context(
+                prompt["system"], [{"role": "user", "content": prompt["user"]}]
+            )
 
             # Extract JSON from response
             try:
@@ -316,36 +304,51 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
                 json_pattern = r"```json\n(.*?)\n```"
                 import re
 
-                json_match = re.search(json_pattern, response, re.DOTALL)
-
-                if json_match:
-                    json_str = json_match.group(1)
-                else:
-                    # Try to extract JSON directly
-                    json_str = response
-                    # Remove non-JSON parts
-                    start_idx = json_str.find("{")
-                    end_idx = json_str.rfind("}") + 1
-                    if start_idx >= 0 and end_idx > start_idx:
-                        json_str = json_str[start_idx:end_idx]
-
-                # Parse JSON
-                result = json.loads(json_str)
-
-                # Extract metrics
-                metrics = {}
-                for key in ["readability", "maintainability", "efficiency"]:
-                    if key in result:
-                        metrics[key] = float(result[key])
-
-                return metrics
+                avg_metrics = {}
+                for i, response in enumerate(responses):
+                    json_match = re.search(json_pattern, response, re.DOTALL)
+
+                    if json_match:
+                        json_str = json_match.group(1)
+                    else:
+                        # Try to extract JSON directly
+                        json_str = response
+                        # Remove non-JSON parts
+                        start_idx = json_str.find("{")
+                        end_idx = json_str.rfind("}") + 1
+                        if start_idx >= 0 and end_idx > start_idx:
+                            json_str = json_str[start_idx:end_idx]
+
+                    # Parse JSON
+                    result = json.loads(json_str)
+
+                    # Filter out all non-numeric values
+                    metrics = {
+                        name: float(value)
+                        for name, value in result.items()
+                        if isinstance(value, (int, float))
+                    }
+
+                    # Weight of the model in the ensemble
+                    weight = self.llm_ensemble.weights[i] if self.llm_ensemble.weights else 1.0
+
+                    # Average the metrics
+                    for name, value in metrics.items():
+                        if name in avg_metrics:
+                            avg_metrics[name] += value * weight
+                        else:
+                            avg_metrics[name] = value * weight
+
+                return avg_metrics
             except Exception as e:
                 logger.warning(f"Error parsing LLM response: {str(e)}")
+                traceback.print_exc()
                 return {}
 
         except Exception as e:
             logger.error(f"Error in LLM evaluation: {str(e)}")
+            traceback.print_exc()
             return {}
 
     def _passes_threshold(self, metrics: Dict[str, float], threshold: float) -> bool:

diff --git a/openevolve/llm/ensemble.py b/openevolve/llm/ensemble.py
index 0c518ccad..b286ff68e 100644
--- a/openevolve/llm/ensemble.py
+++ b/openevolve/llm/ensemble.py
@@ -7,37 +7,33 @@
 import random
 from typing import Dict, List, Optional, Tuple
 
-from openevolve.config import LLMConfig
 from openevolve.llm.base import LLMInterface
 from openevolve.llm.openai import OpenAILLM
+from openevolve.config import LLMModelConfig
 
 logger = logging.getLogger(__name__)
 
 
 class LLMEnsemble:
-    """Ensemble of LLMs for generating diverse code modifications"""
+    """Ensemble of LLMs"""
 
-    def __init__(self, config: LLMConfig):
-        self.config = config
+    def __init__(self, models_cfg: List[LLMModelConfig]):
+        self.models_cfg = models_cfg
 
-        # Initialize primary and secondary models
-        self.primary_model = OpenAILLM(config, model=config.primary_model)
-        self.secondary_model = OpenAILLM(config, model=config.secondary_model)
+        # Initialize models from the configuration
+        self.models = [OpenAILLM(model_cfg) for model_cfg in models_cfg]
 
-        # Model weights for sampling
-        self._weights = [
-            config.primary_model_weight,
-            config.secondary_model_weight,
-        ]
-
-        # Normalize weights
-        total = sum(self._weights)
-        self._weights = [w / total for w in self._weights]
+        # Extract and normalize model weights
+        self.weights = [model.weight for model in models_cfg]
+        total = sum(self.weights)
+        self.weights = [w / total for w in self.weights]
 
         logger.info(
             f"Initialized LLM ensemble with models: "
-            f"{config.primary_model} (weight: {self._weights[0]:.2f}), "
-            f"{config.secondary_model} (weight: {self._weights[1]:.2f})"
+            + ", ".join(
+                f"{model.name} (weight: {weight:.2f})"
+                for model, weight in zip(models_cfg, self.weights)
+            )
         )
 
     async def generate(self, prompt: str, **kwargs) -> str:
@@ -54,9 +50,8 @@ async def generate_with_context(
 
     def _sample_model(self) -> LLMInterface:
         """Sample a model from the ensemble based on weights"""
-        models = [self.primary_model, self.secondary_model]
-        index = random.choices(range(len(models)), weights=self._weights, k=1)[0]
-        return models[index]
+        index = random.choices(range(len(self.models)), weights=self.weights, k=1)[0]
+        return self.models[index]
 
     async def generate_multiple(self, prompt: str, n: int, **kwargs) -> List[str]:
         """Generate multiple texts in parallel"""
@@ -67,3 +62,12 @@ async def parallel_generate(self, prompts: List[str], **kwargs) -> List[str]:
         """Generate responses for multiple prompts in parallel"""
         tasks = [self.generate(prompt, **kwargs) for prompt in prompts]
         return await asyncio.gather(*tasks)
+
+    async def generate_all_with_context(
+        self, system_message: str, messages: List[Dict[str, str]], **kwargs
+    ) -> List[str]:
+        """Generate a response from every model in the ensemble"""
+        responses = []
+        for model in self.models:
+            responses.append(await model.generate_with_context(system_message, messages, **kwargs))
+        return responses

diff --git a/openevolve/llm/openai.py b/openevolve/llm/openai.py
index 9268b5703..c146ecc0c 100644
--- a/openevolve/llm/openai.py
+++ b/openevolve/llm/openai.py
@@ -20,26 +20,31 @@ class OpenAILLM(LLMInterface):
 
     def __init__(
         self,
-        config: LLMConfig,
-        model: Optional[str] = None,
+        model_cfg: Optional[dict] = None,
     ):
-        self.config = config
-        self.model = model or config.primary_model
+        self.model = model_cfg.name
+        self.system_message = model_cfg.system_message
+        self.temperature = model_cfg.temperature
+        self.top_p = model_cfg.top_p
+        self.max_tokens = model_cfg.max_tokens
+        self.timeout = model_cfg.timeout
+        self.retries = model_cfg.retries
+        self.retry_delay = model_cfg.retry_delay
+        self.api_base = model_cfg.api_base
+        self.api_key = model_cfg.api_key
 
         # Set up API client
         self.client = openai.OpenAI(
-            api_key=config.api_key,
-            base_url=config.api_base,
+            api_key=self.api_key,
+            base_url=self.api_base,
         )
 
         logger.info(f"Initialized OpenAI LLM with model: {self.model}")
 
     async def generate(self, prompt: str, **kwargs) -> str:
         """Generate text from a prompt"""
-        # Use default system message if not provided in kwargs
-        system_message = kwargs.pop("system_message", "You are a helpful assistant.")
         return await self.generate_with_context(
-            system_message=system_message,
+            system_message=self.system_message,
             messages=[{"role": "user", "content": prompt}],
             **kwargs,
         )
@@ -53,28 +58,26 @@ async def generate_with_context(
         formatted_messages.extend(messages)
 
         # Set up generation parameters
-        if self.config.api_base == "https://api.openai.com/v1" and str(
-            self.model
-        ).lower().startswith("o"):
+        if self.api_base == "https://api.openai.com/v1" and str(self.model).lower().startswith("o"):
             # For o-series models
             params = {
                 "model": self.model,
                 "messages": formatted_messages,
-                "max_completion_tokens": kwargs.get("max_tokens", self.config.max_tokens),
+                "max_completion_tokens": kwargs.get("max_tokens", self.max_tokens),
             }
         else:
             params = {
                 "model": self.model,
                 "messages": formatted_messages,
-                "temperature": kwargs.get("temperature", self.config.temperature),
-                "top_p": kwargs.get("top_p", self.config.top_p),
-                "max_tokens": kwargs.get("max_tokens", self.config.max_tokens),
+                "temperature": kwargs.get("temperature", self.temperature),
+                "top_p": kwargs.get("top_p", self.top_p),
+                "max_tokens": kwargs.get("max_tokens", self.max_tokens),
             }
 
         # Attempt the API call with retries
-        retries = kwargs.get("retries", self.config.retries)
-        retry_delay = kwargs.get("retry_delay", self.config.retry_delay)
-        timeout = kwargs.get("timeout", self.config.timeout)
+        retries = kwargs.get("retries", self.retries)
+        retry_delay = kwargs.get("retry_delay", self.retry_delay)
+        timeout = kwargs.get("timeout", self.timeout)
 
         for attempt in range(retries + 1):
             try:
@@ -104,6 +107,4 @@ async def _call_api(self, params: Dict[str, Any]) -> str:
         response = await loop.run_in_executor(
             None, lambda: self.client.chat.completions.create(**params)
         )
-
-        # Extract the response content
         return response.choices[0].message.content

diff --git a/openevolve/prompt/sampler.py b/openevolve/prompt/sampler.py
index ad7a6be38..605c4b1da 100644
--- a/openevolve/prompt/sampler.py
+++ b/openevolve/prompt/sampler.py
@@ -44,15 +44,16 @@ def set_templates(
 
     def build_prompt(
         self,
-        current_program: str,
-        parent_program: str,
-        program_metrics: Dict[str, float],
-        previous_programs: List[Dict[str, Any]],
-        top_programs: List[Dict[str, Any]],
+        current_program: str = "",
+        parent_program: str = "",
+        program_metrics: Dict[str, float] = {},
+        previous_programs: List[Dict[str, Any]] = [],
+        top_programs: List[Dict[str, Any]] = [],
         language: str = "python",
         evolution_round: int = 0,
         allow_full_rewrite: bool = False,
         template_key: Optional[str] = None,
+        **kwargs: Any,
     ) -> Dict[str, str]:
         """
         Build a prompt for the LLM
@@ -67,6 +68,7 @@ def build_prompt(
             evolution_round: Current evolution round
             allow_full_rewrite: Whether to allow a full rewrite
             template_key: Optional override for template key
+            **kwargs: Additional keys to replace in the user prompt
 
         Returns:
             Dictionary with 'system' and 'user' keys
@@ -118,6 +120,7 @@ def build_prompt(
             evolution_history=evolution_history,
             current_program=current_program,
             language=language,
+            **kwargs,
         )
 
         return {

diff --git a/openevolve/prompt/templates.py b/openevolve/prompt/templates.py
index b50e34e9b..82d5a6b03 100644
--- a/openevolve/prompt/templates.py
+++ b/openevolve/prompt/templates.py
@@ -12,6 +12,9 @@
 Focus on making targeted changes that will increase the program's performance metrics.
 """
 
+BASE_EVALUATOR_SYSTEM_TEMPLATE = """You are an expert code reviewer.
+Your job is to analyze the provided code and evaluate it systematically."""
+
 # User message template for diff-based evolution
 DIFF_USER_TEMPLATE = """# Current Program Information
 - Current performance metrics: {metrics}
@@ -106,14 +109,38 @@
 Key features: {key_features}
 """
 
+# Template for evaluating a program via an LLM
+EVALUATION_TEMPLATE = """Evaluate the following code on a scale of 0.0 to 1.0 for the following metrics:
+1. Readability: How easy is the code to read and understand?
+2. Maintainability: How easy would the code be to maintain and modify?
+3. Efficiency: How efficient is the code in terms of time and space complexity?
+
+For each metric, provide a score between 0.0 and 1.0, where 1.0 is best.
+
+Code to evaluate:
+```python
+{current_program}
+```
+
+Return your evaluation as a JSON object with the following format:
+{{
+    "readability": [score],
+    "maintainability": [score],
+    "efficiency": [score],
+    "reasoning": "[brief explanation of scores]"
+}}
+"""
+
 # Default templates dictionary
 DEFAULT_TEMPLATES = {
     "system_message": BASE_SYSTEM_TEMPLATE,
+    "evaluator_system_message": BASE_EVALUATOR_SYSTEM_TEMPLATE,
     "diff_user": DIFF_USER_TEMPLATE,
     "full_rewrite_user": FULL_REWRITE_USER_TEMPLATE,
     "evolution_history": EVOLUTION_HISTORY_TEMPLATE,
     "previous_attempt": PREVIOUS_ATTEMPT_TEMPLATE,
     "top_program": TOP_PROGRAM_TEMPLATE,
+    "evaluation": EVALUATION_TEMPLATE,
 }

diff --git a/tests/test_valid_configs.py b/tests/test_valid_configs.py
new file mode 100644
index 000000000..c34a3a373
--- /dev/null
+++ b/tests/test_valid_configs.py
@@ -0,0 +1,34 @@
+"""
+Confirming the validity of configuration files in project directories
+"""
+
+import os
+import unittest
+from unittest.mock import MagicMock, patch
+
+from openevolve.config import Config, load_config
+
+class TestConfigValidity(unittest.TestCase):
+    """Tests that all config files in the configs/ and examples/ directories are valid"""
+
+    def collect_files(self):
+        """Collect all config/*config*.yaml and examples/**/*config*.yaml files"""
+        config_dir = os.path.join(os.path.dirname(__file__), "../configs")
+        example_dir = os.path.join(os.path.dirname(__file__), "../examples")
+        config_files = []
+        for root, _, files in os.walk(config_dir):
+            for file in files:
+                if "config" in file and file.endswith(".yaml"):
+                    config_files.append(os.path.join(root, file))
+        return config_files
+
+    def test_import_config_files(self):
+        """Attempt to import all config files"""
+        config_files = self.collect_files()
+        for config_file in config_files:
+            print(f"Testing config file: {config_file}")
+            config = load_config(config_file)
+            self.assertIsInstance(config, Config, f"Config file {config_file} did not load correctly")
+
+if __name__ == "__main__":
+    unittest.main()


From 16dce114ac34f199d37e51b1ec1ed7ca0ed9d280 Mon Sep 17 00:00:00 2001
From: Julian
Date: Tue, 3 Jun 2025 00:03:44 +0200
Subject: [PATCH 2/2] lint

---
 openevolve/config.py        | 4 +++-
 tests/test_valid_configs.py | 6 +++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/openevolve/config.py b/openevolve/config.py
index c742ef945..4252409cd 100644
--- a/openevolve/config.py
+++ b/openevolve/config.py
@@ -234,7 +234,9 @@ def from_dict(cls, config_dict: Dict[str, Any]) -> "Config":
             if "models" in llm_dict:
                 llm_dict["models"] = [LLMModelConfig(**m) for m in llm_dict["models"]]
             if "evaluator_models" in llm_dict:
-                llm_dict["evaluator_models"] = [LLMModelConfig(**m) for m in llm_dict["evaluator_models"]]
+                llm_dict["evaluator_models"] = [
+                    LLMModelConfig(**m) for m in llm_dict["evaluator_models"]
+                ]
             config.llm = LLMConfig(**llm_dict)
         if "prompt" in config_dict:
             config.prompt = PromptConfig(**config_dict["prompt"])

diff --git a/tests/test_valid_configs.py b/tests/test_valid_configs.py
index c34a3a373..829d23b42 100644
--- a/tests/test_valid_configs.py
+++ b/tests/test_valid_configs.py
@@ -8,6 +8,7 @@
 
 from openevolve.config import Config, load_config
 
+
 class TestConfigValidity(unittest.TestCase):
     """Tests that all config files in the configs/ and examples/ directories are valid"""
 
@@ -28,7 +29,10 @@ def test_import_config_files(self):
         for config_file in config_files:
             print(f"Testing config file: {config_file}")
             config = load_config(config_file)
-            self.assertIsInstance(config, Config, f"Config file {config_file} did not load correctly")
+            self.assertIsInstance(
+                config, Config, f"Config file {config_file} did not load correctly"
+            )
+
 
 if __name__ == "__main__":
     unittest.main()