diff --git a/configs/default_config.yaml b/configs/default_config.yaml
index 4bc7558aa..22f086b81 100644
--- a/configs/default_config.yaml
+++ b/configs/default_config.yaml
@@ -16,13 +16,21 @@ max_code_length: 10000  # Maximum allowed code length in character
 
 # LLM configuration
 llm:
-  # Primary model (used most frequently)
-  primary_model: "gemini-2.0-flash-lite"
-  primary_model_weight: 0.8  # Sampling weight for primary model
-
-  # Secondary model (used for occasional high-quality generations)
-  secondary_model: "gemini-2.0-flash"
-  secondary_model_weight: 0.2  # Sampling weight for secondary model
+  # Models for evolution
+  models:
+    # List of available models with their weights
+    - name: "gemini-2.0-flash-lite"
+      weight: 0.8
+    - name: "gemini-2.0-flash"
+      weight: 0.2
+
+  # Models for LLM feedback
+  evaluator_models:
+    # List of available models with their weights
+    - name: "gemini-2.0-flash-lite"
+      weight: 0.8
+    - name: "gemini-2.0-flash"
+      weight: 0.2
 
   # API configuration
   api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"  # Base URL for API (change for non-OpenAI models)
@@ -42,6 +50,7 @@ llm:
 prompt:
   template_dir: null  # Custom directory for prompt templates
   system_message: "You are an expert coder helping to improve programs through evolution."
+  evaluator_system_message: "You are an expert code reviewer."
 
   # Number of examples to include in the prompt
   num_top_programs: 3  # Number of top-performing programs to include
diff --git a/openevolve/config.py b/openevolve/config.py
index 460907ba4..4252409cd 100644
--- a/openevolve/config.py
+++ b/openevolve/config.py
@@ -11,22 +11,41 @@
 
 
 @dataclass
-class LLMConfig:
-    """Configuration for LLM models"""
+class LLMModelConfig:
+    """Configuration for a single LLM model"""
 
-    # Primary model
-    primary_model: str = "gemini-2.0-flash-lite"
-    primary_model_weight: float = 0.8
+    # API configuration
+    api_base: str = None
+    api_key: Optional[str] = None
+    name: str = None
 
-    # Secondary model
-    secondary_model: str = "gemini-2.0-flash"
-    secondary_model_weight: float = 0.2
+    # Weight for model in ensemble
+    weight: float = 1.0
+
+    # Generation parameters
+    system_message: Optional[str] = None
+    temperature: float = None
+    top_p: float = None
+    max_tokens: int = None
+
+    # Request parameters
+    timeout: int = None
+    retries: int = None
+    retry_delay: int = None
+
+
+@dataclass
+class LLMConfig(LLMModelConfig):
+    """Configuration for LLM models"""
 
     # API configuration
     api_base: str = "https://api.openai.com/v1"
-    api_key: Optional[str] = None
+    name: str = "gpt-4o"
 
     # Generation parameters
+    system_message: Optional[str] = (
+        "You are an expert coder helping to improve programs through evolution."
+    )
     temperature: float = 0.7
     top_p: float = 0.95
     max_tokens: int = 4096
@@ -36,6 +55,61 @@ class LLMConfig:
     retries: int = 3
     retry_delay: int = 5
 
+    # n-model configuration for evolution LLM ensemble
+    models: List[LLMModelConfig] = field(default_factory=lambda: [LLMModelConfig()])
+
+    # n-model configuration for evaluator LLM ensemble
+    evaluator_models: List[LLMModelConfig] = field(default_factory=lambda: [])
+
+    # Backwards compatibility with primary_model(_weight) options
+    primary_model: str = "gemini-2.0-flash-lite"
+    primary_model_weight: float = 0.8
+    secondary_model: str = "gemini-2.0-flash"
+    secondary_model_weight: float = 0.2
+
+    def __post_init__(self):
+        """Post-initialization to set up model configurations"""
+        # Handle backward compatibility for primary_model(_weight) and secondary_model(_weight).
+ if (self.primary_model or self.primary_model_weight) and len(self.models) < 1: + # Ensure we have a primary model + self.models.append(LLMModelConfig()) + if self.primary_model: + self.models[0].name = self.primary_model + if self.primary_model_weight: + self.models[0].weight = self.primary_model_weight + + if (self.secondary_model or self.secondary_model_weight) and len(self.models) < 2: + # Ensure we have a second model + self.models.append(LLMModelConfig()) + if self.secondary_model: + self.models[1].name = self.secondary_model + if self.secondary_model_weight: + self.models[1].weight = self.secondary_model_weight + + # If no evaluator models are defined, use the same models as for evolution + if not self.evaluator_models or len(self.evaluator_models) < 1: + self.evaluator_models = self.models.copy() + + # Update models with shared configuration values + shared_config = { + "api_base": self.api_base, + "api_key": self.api_key, + "temperature": self.temperature, + "top_p": self.top_p, + "max_tokens": self.max_tokens, + "timeout": self.timeout, + "retries": self.retries, + "retry_delay": self.retry_delay, + } + self.update_model_params(shared_config) + + def update_model_params(self, args: Dict[str, Any], overwrite: bool = False) -> None: + """Update model parameters for all models""" + for model in self.models + self.evaluator_models: + for key, value in args.items(): + if overwrite or getattr(model, key, None) is None: + setattr(model, key, value) + @dataclass class PromptConfig: @@ -43,6 +117,7 @@ class PromptConfig: template_dir: Optional[str] = None system_message: str = "You are an expert coder helping to improve programs through evolution." + evaluator_system_message: str = """You are an expert code reviewer.""" # Number of examples to include in the prompt num_top_programs: int = 3 @@ -155,7 +230,14 @@ def from_dict(cls, config_dict: Dict[str, Any]) -> "Config": # Update nested configs if "llm" in config_dict: - config.llm = LLMConfig(**config_dict["llm"]) + llm_dict = config_dict["llm"] + if "models" in llm_dict: + llm_dict["models"] = [LLMModelConfig(**m) for m in llm_dict["models"]] + if "evaluator_models" in llm_dict: + llm_dict["evaluator_models"] = [ + LLMModelConfig(**m) for m in llm_dict["evaluator_models"] + ] + config.llm = LLMConfig(**llm_dict) if "prompt" in config_dict: config.prompt = PromptConfig(**config_dict["prompt"]) if "database" in config_dict: @@ -176,10 +258,8 @@ def to_dict(self) -> Dict[str, Any]: "random_seed": self.random_seed, # Component configurations "llm": { - "primary_model": self.llm.primary_model, - "primary_model_weight": self.llm.primary_model_weight, - "secondary_model": self.llm.secondary_model, - "secondary_model_weight": self.llm.secondary_model_weight, + "models": self.llm.models, + "evaluator_models": self.llm.evaluator_models, "api_base": self.llm.api_base, "temperature": self.llm.temperature, "top_p": self.llm.top_p, @@ -191,6 +271,7 @@ def to_dict(self) -> Dict[str, Any]: "prompt": { "template_dir": self.prompt.template_dir, "system_message": self.prompt.system_message, + "evaluator_system_message": self.prompt.evaluator_system_message, "num_top_programs": self.prompt.num_top_programs, "num_diverse_programs": self.prompt.num_diverse_programs, "use_template_stochasticity": self.prompt.use_template_stochasticity, @@ -245,16 +326,17 @@ def to_yaml(self, path: Union[str, Path]) -> None: def load_config(config_path: Optional[Union[str, Path]] = None) -> Config: """Load configuration from a YAML file or use defaults""" if config_path and 
os.path.exists(config_path): - return Config.from_yaml(config_path) + config = Config.from_yaml(config_path) + else: + config = Config() + + # Use environment variables if available + api_key = os.environ.get("OPENAI_API_KEY") + api_base = os.environ.get("OPENAI_API_BASE", "https://api.openai.com/v1") - # Use environment variables if available - api_key = os.environ.get("OPENAI_API_KEY") - api_base = os.environ.get("OPENAI_API_BASE", "https://api.openai.com/v1") + config.llm.update_model_params({"api_key": api_key, "api_base": api_base}) - config = Config() - if api_key: - config.llm.api_key = api_key - if api_base: - config.llm.api_base = api_base + # Make the system message available to the individual models, in case it is not provided from the prompt sampler + config.llm.update_model_params({"system_message": config.prompt.system_message}) return config diff --git a/openevolve/controller.py b/openevolve/controller.py index c08194378..466b6d779 100644 --- a/openevolve/controller.py +++ b/openevolve/controller.py @@ -96,15 +96,25 @@ def __init__( self.file_extension = f".{self.file_extension}" # Initialize components - self.llm_ensemble = LLMEnsemble(self.config.llm) + self.llm_ensemble = LLMEnsemble(self.config.llm.models) + self.llm_evaluator_ensemble = LLMEnsemble(self.config.llm.evaluator_models) + self.prompt_sampler = PromptSampler(self.config.prompt) + self.evaluator_prompt_sampler = PromptSampler(self.config.prompt) + self.evaluator_prompt_sampler.set_templates("evaluator_system_message") # Pass random seed to database if specified if self.config.random_seed is not None: self.config.database.random_seed = self.config.random_seed self.database = ProgramDatabase(self.config.database) - self.evaluator = Evaluator(self.config.evaluator, evaluation_file, self.llm_ensemble) + + self.evaluator = Evaluator( + self.config.evaluator, + evaluation_file, + self.llm_evaluator_ensemble, + self.evaluator_prompt_sampler, + ) logger.info(f"Initialized OpenEvolve with {initial_program_path} " f"and {evaluation_file}") diff --git a/openevolve/evaluator.py b/openevolve/evaluator.py index fb9fbd8e2..8a94bca9e 100644 --- a/openevolve/evaluator.py +++ b/openevolve/evaluator.py @@ -14,10 +14,12 @@ import uuid from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union +import traceback from openevolve.config import EvaluatorConfig from openevolve.llm.ensemble import LLMEnsemble from openevolve.utils.async_utils import TaskPool, run_in_executor +from openevolve.prompt.sampler import PromptSampler from openevolve.utils.format_utils import format_metrics_safe logger = logging.getLogger(__name__) @@ -36,10 +38,12 @@ def __init__( config: EvaluatorConfig, evaluation_file: str, llm_ensemble: Optional[LLMEnsemble] = None, + prompt_sampler: Optional[PromptSampler] = None, ): self.config = config self.evaluation_file = evaluation_file self.llm_ensemble = llm_ensemble + self.prompt_sampler = prompt_sampler # Create a task pool for parallel evaluation self.task_pool = TaskPool(max_concurrency=config.parallel_evaluations) @@ -286,30 +290,14 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]: try: # Create prompt for LLM - prompt = f""" - Evaluate the following code on a scale of 0.0 to 1.0 for the following metrics: - 1. Readability: How easy is the code to read and understand? - 2. Maintainability: How easy would the code be to maintain and modify? - 3. Efficiency: How efficient is the code in terms of time and space complexity? 
- - For each metric, provide a score between 0.0 and 1.0, where 1.0 is best. - - Code to evaluate: - ```python - {program_code} - ``` - - Return your evaluation as a JSON object with the following format: - {{ - "readability": [score], - "maintainability": [score], - "efficiency": [score], - "reasoning": "[brief explanation of scores]" - }} - """ + prompt = self.prompt_sampler.build_prompt( + current_program=program_code, template_key="evaluation" + ) # Get LLM response - response = await self.llm_ensemble.generate(prompt) + responses = await self.llm_ensemble.generate_all_with_context( + prompt["system"], [{"role": "user", "content": prompt["user"]}] + ) # Extract JSON from response try: @@ -317,36 +305,51 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]: json_pattern = r"```json\n(.*?)\n```" import re - json_match = re.search(json_pattern, response, re.DOTALL) - - if json_match: - json_str = json_match.group(1) - else: - # Try to extract JSON directly - json_str = response - # Remove non-JSON parts - start_idx = json_str.find("{") - end_idx = json_str.rfind("}") + 1 - if start_idx >= 0 and end_idx > start_idx: - json_str = json_str[start_idx:end_idx] - - # Parse JSON - result = json.loads(json_str) - - # Extract metrics - metrics = {} - for key in ["readability", "maintainability", "efficiency"]: - if key in result: - metrics[key] = float(result[key]) - - return metrics + avg_metrics = {} + for i, response in enumerate(responses): + json_match = re.search(json_pattern, response, re.DOTALL) + + if json_match: + json_str = json_match.group(1) + else: + # Try to extract JSON directly + json_str = response + # Remove non-JSON parts + start_idx = json_str.find("{") + end_idx = json_str.rfind("}") + 1 + if start_idx >= 0 and end_idx > start_idx: + json_str = json_str[start_idx:end_idx] + + # Parse JSON + result = json.loads(json_str) + + # Filter all non-numeric values + metrics = { + name: float(value) + for name, value in result.items() + if isinstance(value, (int, float)) + } + + # Weight of the model in the ensemble + weight = self.llm_ensemble.weights[i] if self.llm_ensemble.weights else 1.0 + + # Average the metrics + for name, value in metrics.items(): + if name in avg_metrics: + avg_metrics[name] += value * weight + else: + avg_metrics[name] = value * weight + + return avg_metrics except Exception as e: logger.warning(f"Error parsing LLM response: {str(e)}") + traceback.print_exc() return {} except Exception as e: logger.error(f"Error in LLM evaluation: {str(e)}") + traceback.print_exc() return {} def _passes_threshold(self, metrics: Dict[str, float], threshold: float) -> bool: diff --git a/openevolve/llm/ensemble.py b/openevolve/llm/ensemble.py index 0c518ccad..b286ff68e 100644 --- a/openevolve/llm/ensemble.py +++ b/openevolve/llm/ensemble.py @@ -7,37 +7,33 @@ import random from typing import Dict, List, Optional, Tuple -from openevolve.config import LLMConfig from openevolve.llm.base import LLMInterface from openevolve.llm.openai import OpenAILLM +from openevolve.config import LLMModelConfig logger = logging.getLogger(__name__) class LLMEnsemble: - """Ensemble of LLMs for generating diverse code modifications""" + """Ensemble of LLMs""" - def __init__(self, config: LLMConfig): - self.config = config + def __init__(self, models_cfg: List[LLMModelConfig]): + self.models_cfg = models_cfg - # Initialize primary and secondary models - self.primary_model = OpenAILLM(config, model=config.primary_model) - self.secondary_model = OpenAILLM(config, 
model=config.secondary_model)
+        # Initialize models from the configuration
+        self.models = [OpenAILLM(model_cfg) for model_cfg in models_cfg]
 
-        # Model weights for sampling
-        self._weights = [
-            config.primary_model_weight,
-            config.secondary_model_weight,
-        ]
-
-        # Normalize weights
-        total = sum(self._weights)
-        self._weights = [w / total for w in self._weights]
+        # Extract and normalize model weights
+        self.weights = [model.weight for model in models_cfg]
+        total = sum(self.weights)
+        self.weights = [w / total for w in self.weights]
 
         logger.info(
             f"Initialized LLM ensemble with models: "
-            f"{config.primary_model} (weight: {self._weights[0]:.2f}), "
-            f"{config.secondary_model} (weight: {self._weights[1]:.2f})"
+            + ", ".join(
+                f"{model.name} (weight: {weight:.2f})"
+                for model, weight in zip(models_cfg, self.weights)
+            )
         )
 
     async def generate(self, prompt: str, **kwargs) -> str:
@@ -54,9 +50,8 @@ async def generate_with_context(
 
     def _sample_model(self) -> LLMInterface:
         """Sample a model from the ensemble based on weights"""
-        models = [self.primary_model, self.secondary_model]
-        index = random.choices(range(len(models)), weights=self._weights, k=1)[0]
-        return models[index]
+        index = random.choices(range(len(self.models)), weights=self.weights, k=1)[0]
+        return self.models[index]
 
     async def generate_multiple(self, prompt: str, n: int, **kwargs) -> List[str]:
         """Generate multiple texts in parallel"""
@@ -67,3 +62,12 @@ async def parallel_generate(self, prompts: List[str], **kwargs) -> List[str]:
         """Generate responses for multiple prompts in parallel"""
         tasks = [self.generate(prompt, **kwargs) for prompt in prompts]
         return await asyncio.gather(*tasks)
+
+    async def generate_all_with_context(
+        self, system_message: str, messages: List[Dict[str, str]], **kwargs
+    ) -> List[str]:
+        """Generate a response from every model in the ensemble"""
+        responses = []
+        for model in self.models:
+            responses.append(await model.generate_with_context(system_message, messages, **kwargs))
+        return responses
diff --git a/openevolve/llm/openai.py b/openevolve/llm/openai.py
index 9268b5703..c146ecc0c 100644
--- a/openevolve/llm/openai.py
+++ b/openevolve/llm/openai.py
@@ -20,26 +20,31 @@ class OpenAILLM(LLMInterface):
 
     def __init__(
         self,
-        config: LLMConfig,
-        model: Optional[str] = None,
+        model_cfg: Optional[dict] = None,
     ):
-        self.config = config
-        self.model = model or config.primary_model
+        self.model = model_cfg.name
+        self.system_message = model_cfg.system_message
+        self.temperature = model_cfg.temperature
+        self.top_p = model_cfg.top_p
+        self.max_tokens = model_cfg.max_tokens
+        self.timeout = model_cfg.timeout
+        self.retries = model_cfg.retries
+        self.retry_delay = model_cfg.retry_delay
+        self.api_base = model_cfg.api_base
+        self.api_key = model_cfg.api_key
 
         # Set up API client
         self.client = openai.OpenAI(
-            api_key=config.api_key,
-            base_url=config.api_base,
+            api_key=self.api_key,
+            base_url=self.api_base,
         )
 
         logger.info(f"Initialized OpenAI LLM with model: {self.model}")
 
     async def generate(self, prompt: str, **kwargs) -> str:
         """Generate text from a prompt"""
-        # Use default system message if not provided in kwargs
-        system_message = kwargs.pop("system_message", "You are a helpful assistant.")
         return await self.generate_with_context(
-            system_message=system_message,
+            system_message=self.system_message,
             messages=[{"role": "user", "content": prompt}],
             **kwargs,
         )
@@ -53,28 +58,26 @@ async def generate_with_context(
         formatted_messages.extend(messages)
 
         # Set up generation parameters
-        if 
self.config.api_base == "https://api.openai.com/v1" and str( - self.model - ).lower().startswith("o"): + if self.api_base == "https://api.openai.com/v1" and str(self.model).lower().startswith("o"): # For o-series models params = { "model": self.model, "messages": formatted_messages, - "max_completion_tokens": kwargs.get("max_tokens", self.config.max_tokens), + "max_completion_tokens": kwargs.get("max_tokens", self.max_tokens), } else: params = { "model": self.model, "messages": formatted_messages, - "temperature": kwargs.get("temperature", self.config.temperature), - "top_p": kwargs.get("top_p", self.config.top_p), - "max_tokens": kwargs.get("max_tokens", self.config.max_tokens), + "temperature": kwargs.get("temperature", self.temperature), + "top_p": kwargs.get("top_p", self.top_p), + "max_tokens": kwargs.get("max_tokens", self.max_tokens), } # Attempt the API call with retries - retries = kwargs.get("retries", self.config.retries) - retry_delay = kwargs.get("retry_delay", self.config.retry_delay) - timeout = kwargs.get("timeout", self.config.timeout) + retries = kwargs.get("retries", self.retries) + retry_delay = kwargs.get("retry_delay", self.retry_delay) + timeout = kwargs.get("timeout", self.timeout) for attempt in range(retries + 1): try: @@ -104,6 +107,4 @@ async def _call_api(self, params: Dict[str, Any]) -> str: response = await loop.run_in_executor( None, lambda: self.client.chat.completions.create(**params) ) - - # Extract the response content return response.choices[0].message.content diff --git a/openevolve/prompt/sampler.py b/openevolve/prompt/sampler.py index a05910b98..6d543424b 100644 --- a/openevolve/prompt/sampler.py +++ b/openevolve/prompt/sampler.py @@ -46,15 +46,16 @@ def set_templates( def build_prompt( self, - current_program: str, - parent_program: str, - program_metrics: Dict[str, float], - previous_programs: List[Dict[str, Any]], - top_programs: List[Dict[str, Any]], + current_program: str = "", + parent_program: str = "", + program_metrics: Dict[str, float] = {}, + previous_programs: List[Dict[str, Any]] = [], + top_programs: List[Dict[str, Any]] = [], language: str = "python", evolution_round: int = 0, allow_full_rewrite: bool = False, template_key: Optional[str] = None, + **kwargs: Any, ) -> Dict[str, str]: """ Build a prompt for the LLM @@ -69,6 +70,7 @@ def build_prompt( evolution_round: Current evolution round allow_full_rewrite: Whether to allow a full rewrite template_key: Optional override for template key + **kwargs: Additional keys to replace in the user prompt Returns: Dictionary with 'system' and 'user' keys @@ -120,6 +122,7 @@ def build_prompt( evolution_history=evolution_history, current_program=current_program, language=language, + **kwargs, ) return { diff --git a/openevolve/prompt/templates.py b/openevolve/prompt/templates.py index b50e34e9b..82d5a6b03 100644 --- a/openevolve/prompt/templates.py +++ b/openevolve/prompt/templates.py @@ -12,6 +12,9 @@ Focus on making targeted changes that will increase the program's performance metrics. """ +BASE_EVALUATOR_SYSTEM_TEMPLATE = """You are an expert code reviewer. +Your job is to analyze the provided code and evaluate it systematically.""" + # User message template for diff-based evolution DIFF_USER_TEMPLATE = """# Current Program Information - Current performance metrics: {metrics} @@ -106,14 +109,38 @@ Key features: {key_features} """ +# Template for evaluating a program via an LLM +EVALUATION_TEMPLATE = """Evaluate the following code on a scale of 0.0 to 1.0 for the following metrics: +1. 
Readability: How easy is the code to read and understand?
+2. Maintainability: How easy would the code be to maintain and modify?
+3. Efficiency: How efficient is the code in terms of time and space complexity?
+
+For each metric, provide a score between 0.0 and 1.0, where 1.0 is best.
+
+Code to evaluate:
+```python
+{current_program}
+```
+
+Return your evaluation as a JSON object with the following format:
+{{
+    "readability": [score],
+    "maintainability": [score],
+    "efficiency": [score],
+    "reasoning": "[brief explanation of scores]"
+}}
+"""
+
 # Default templates dictionary
 DEFAULT_TEMPLATES = {
     "system_message": BASE_SYSTEM_TEMPLATE,
+    "evaluator_system_message": BASE_EVALUATOR_SYSTEM_TEMPLATE,
     "diff_user": DIFF_USER_TEMPLATE,
     "full_rewrite_user": FULL_REWRITE_USER_TEMPLATE,
     "evolution_history": EVOLUTION_HISTORY_TEMPLATE,
     "previous_attempt": PREVIOUS_ATTEMPT_TEMPLATE,
     "top_program": TOP_PROGRAM_TEMPLATE,
+    "evaluation": EVALUATION_TEMPLATE,
 }
diff --git a/tests/test_valid_configs.py b/tests/test_valid_configs.py
new file mode 100644
index 000000000..829d23b42
--- /dev/null
+++ b/tests/test_valid_configs.py
@@ -0,0 +1,38 @@
+"""
+Confirming the validity of configuration files in project directories
+"""
+
+import os
+import unittest
+
+from openevolve.config import Config, load_config
+
+
+class TestConfigValidity(unittest.TestCase):
+    """Tests that all config files in the configs/ and examples/ directories are valid"""
+
+    def collect_files(self):
+        """Collect all configs/*config*.yaml and examples/**/*config*.yaml files"""
+        config_dir = os.path.join(os.path.dirname(__file__), "../configs")
+        example_dir = os.path.join(os.path.dirname(__file__), "../examples")
+        config_files = []
+        for search_dir in (config_dir, example_dir):
+            for root, _, files in os.walk(search_dir):
+                for file in files:
+                    if "config" in file and file.endswith(".yaml"):
+                        config_files.append(os.path.join(root, file))
+        return config_files
+
+    def test_import_config_files(self):
+        """Attempt to load all collected config files"""
+        config_files = self.collect_files()
+        for config_file in config_files:
+            print(f"Testing config file: {config_file}")
+            config = load_config(config_file)
+            self.assertIsInstance(
+                config, Config, f"Config file {config_file} did not load correctly"
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
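
A minimal usage sketch of the wiring introduced by this patch (not part of the diff itself): the evolution ensemble and the evaluator ensemble are now built from separate weighted model lists, mirroring the controller changes above. The config path and the presence of an API key in the environment are assumptions for illustration.

```python
# Sketch only: construct the two ensembles the same way the controller now does.
from openevolve.config import load_config
from openevolve.llm.ensemble import LLMEnsemble

# Assumes the repo's configs/default_config.yaml and that OPENAI_API_KEY
# (or the key expected by the configured api_base) is set in the environment.
config = load_config("configs/default_config.yaml")

# One weighted ensemble drives code evolution, the other provides LLM feedback;
# each samples a model per request via random.choices over normalized weights.
evolution_ensemble = LLMEnsemble(config.llm.models)
evaluator_ensemble = LLMEnsemble(config.llm.evaluator_models)
```

Legacy configs that still set `primary_model`/`secondary_model` are folded into `config.llm.models` by `LLMConfig.__post_init__`, so the same construction works for both old and new YAML files.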