Commit 3fc9465

Merge pull request #47 from jvm123/feat-n-model-ensemble
Feature: Better support for LLM feedback and handling of LLM ensembles.
2 parents: 166f77f + 659e128 · commit: 3fc9465

9 files changed: +301 −124 lines

configs/default_config.yaml

Lines changed: 16 additions & 7 deletions

```diff
@@ -16,13 +16,21 @@ max_code_length: 10000  # Maximum allowed code length in character
 
 # LLM configuration
 llm:
-  # Primary model (used most frequently)
-  primary_model: "gemini-2.0-flash-lite"
-  primary_model_weight: 0.8    # Sampling weight for primary model
-
-  # Secondary model (used for occasional high-quality generations)
-  secondary_model: "gemini-2.0-flash"
-  secondary_model_weight: 0.2  # Sampling weight for secondary model
+  # Models for evolution
+  models:
+    # List of available models with their weights
+    - name: "gemini-2.0-flash-lite"
+      weight: 0.8
+    - name: "gemini-2.0-flash"
+      weight: 0.2
+
+  # Models for LLM feedback
+  evaluator_models:
+    # List of available models with their weights
+    - name: "gemini-2.0-flash-lite"
+      weight: 0.8
+    - name: "gemini-2.0-flash"
+      weight: 0.2
 
   # API configuration
   api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"  # Base URL for API (change for non-OpenAI models)
@@ -42,6 +50,7 @@ llm:
 prompt:
   template_dir: null  # Custom directory for prompt templates
   system_message: "You are an expert coder helping to improve programs through evolution."
+  evaluator_system_message: "You are an expert code reviewer."
 
   # Number of examples to include in the prompt
   num_top_programs: 3  # Number of top-performing programs to include
```
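
For reference, a minimal sketch of how a user config could use the new list format, for example to give the LLM-feedback ensemble a different model mix than the evolution ensemble (the model names and weights below are illustrative, not part of this commit):

```yaml
llm:
  # Evolution ensemble: models are sampled according to their weights
  models:
    - name: "gemini-2.0-flash-lite"
      weight: 0.7
    - name: "gemini-2.0-flash"
      weight: 0.3

  # Feedback ensemble; if this section is omitted, it falls back to `models`
  evaluator_models:
    - name: "gemini-2.0-flash"
      weight: 1.0
```

Each entry is parsed into an `LLMModelConfig` (see `openevolve/config.py` below), so an entry can also override shared settings such as `temperature` for that model only.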

openevolve/config.py

Lines changed: 105 additions & 23 deletions

```diff
@@ -11,22 +11,41 @@
 
 
 @dataclass
-class LLMConfig:
-    """Configuration for LLM models"""
+class LLMModelConfig:
+    """Configuration for a single LLM model"""
 
-    # Primary model
-    primary_model: str = "gemini-2.0-flash-lite"
-    primary_model_weight: float = 0.8
+    # API configuration
+    api_base: str = None
+    api_key: Optional[str] = None
+    name: str = None
 
-    # Secondary model
-    secondary_model: str = "gemini-2.0-flash"
-    secondary_model_weight: float = 0.2
+    # Weight for model in ensemble
+    weight: float = 1.0
+
+    # Generation parameters
+    system_message: Optional[str] = None
+    temperature: float = None
+    top_p: float = None
+    max_tokens: int = None
+
+    # Request parameters
+    timeout: int = None
+    retries: int = None
+    retry_delay: int = None
+
+
+@dataclass
+class LLMConfig(LLMModelConfig):
+    """Configuration for LLM models"""
 
     # API configuration
     api_base: str = "https://api.openai.com/v1"
-    api_key: Optional[str] = None
+    name: str = "gpt-4o"
 
     # Generation parameters
+    system_message: Optional[str] = (
+        "You are an expert coder helping to improve programs through evolution."
+    )
     temperature: float = 0.7
     top_p: float = 0.95
     max_tokens: int = 4096
@@ -36,13 +55,69 @@ class LLMConfig:
     retries: int = 3
     retry_delay: int = 5
 
+    # n-model configuration for evolution LLM ensemble
+    models: List[LLMModelConfig] = field(default_factory=lambda: [LLMModelConfig()])
+
+    # n-model configuration for evaluator LLM ensemble
+    evaluator_models: List[LLMModelConfig] = field(default_factory=lambda: [])
+
+    # Backwards compatibility with primary_model(_weight) options
+    primary_model: str = "gemini-2.0-flash-lite"
+    primary_model_weight: float = 0.8
+    secondary_model: str = "gemini-2.0-flash"
+    secondary_model_weight: float = 0.2
+
+    def __post_init__(self):
+        """Post-initialization to set up model configurations"""
+        # Handle backward compatibility for primary_model(_weight) and secondary_model(_weight).
+        if (self.primary_model or self.primary_model_weight) and len(self.models) < 1:
+            # Ensure we have a primary model
+            self.models.append(LLMModelConfig())
+        if self.primary_model:
+            self.models[0].name = self.primary_model
+        if self.primary_model_weight:
+            self.models[0].weight = self.primary_model_weight
+
+        if (self.secondary_model or self.secondary_model_weight) and len(self.models) < 2:
+            # Ensure we have a second model
+            self.models.append(LLMModelConfig())
+        if self.secondary_model:
+            self.models[1].name = self.secondary_model
+        if self.secondary_model_weight:
+            self.models[1].weight = self.secondary_model_weight
+
+        # If no evaluator models are defined, use the same models as for evolution
+        if not self.evaluator_models or len(self.evaluator_models) < 1:
+            self.evaluator_models = self.models.copy()
+
+        # Update models with shared configuration values
+        shared_config = {
+            "api_base": self.api_base,
+            "api_key": self.api_key,
+            "temperature": self.temperature,
+            "top_p": self.top_p,
+            "max_tokens": self.max_tokens,
+            "timeout": self.timeout,
+            "retries": self.retries,
+            "retry_delay": self.retry_delay,
+        }
+        self.update_model_params(shared_config)
+
+    def update_model_params(self, args: Dict[str, Any], overwrite: bool = False) -> None:
+        """Update model parameters for all models"""
+        for model in self.models + self.evaluator_models:
+            for key, value in args.items():
+                if overwrite or getattr(model, key, None) is None:
+                    setattr(model, key, value)
+
 
 @dataclass
 class PromptConfig:
     """Configuration for prompt generation"""
 
     template_dir: Optional[str] = None
     system_message: str = "You are an expert coder helping to improve programs through evolution."
+    evaluator_system_message: str = """You are an expert code reviewer."""
 
     # Number of examples to include in the prompt
     num_top_programs: int = 3
@@ -155,7 +230,14 @@ def from_dict(cls, config_dict: Dict[str, Any]) -> "Config":
 
         # Update nested configs
         if "llm" in config_dict:
-            config.llm = LLMConfig(**config_dict["llm"])
+            llm_dict = config_dict["llm"]
+            if "models" in llm_dict:
+                llm_dict["models"] = [LLMModelConfig(**m) for m in llm_dict["models"]]
+            if "evaluator_models" in llm_dict:
+                llm_dict["evaluator_models"] = [
+                    LLMModelConfig(**m) for m in llm_dict["evaluator_models"]
+                ]
+            config.llm = LLMConfig(**llm_dict)
         if "prompt" in config_dict:
            config.prompt = PromptConfig(**config_dict["prompt"])
         if "database" in config_dict:
@@ -176,10 +258,8 @@ def to_dict(self) -> Dict[str, Any]:
             "random_seed": self.random_seed,
             # Component configurations
             "llm": {
-                "primary_model": self.llm.primary_model,
-                "primary_model_weight": self.llm.primary_model_weight,
-                "secondary_model": self.llm.secondary_model,
-                "secondary_model_weight": self.llm.secondary_model_weight,
+                "models": self.llm.models,
+                "evaluator_models": self.llm.evaluator_models,
                 "api_base": self.llm.api_base,
                 "temperature": self.llm.temperature,
                 "top_p": self.llm.top_p,
@@ -191,6 +271,7 @@ def to_dict(self) -> Dict[str, Any]:
             "prompt": {
                 "template_dir": self.prompt.template_dir,
                 "system_message": self.prompt.system_message,
+                "evaluator_system_message": self.prompt.evaluator_system_message,
                 "num_top_programs": self.prompt.num_top_programs,
                 "num_diverse_programs": self.prompt.num_diverse_programs,
                 "use_template_stochasticity": self.prompt.use_template_stochasticity,
@@ -245,16 +326,17 @@ def to_yaml(self, path: Union[str, Path]) -> None:
 def load_config(config_path: Optional[Union[str, Path]] = None) -> Config:
     """Load configuration from a YAML file or use defaults"""
     if config_path and os.path.exists(config_path):
-        return Config.from_yaml(config_path)
+        config = Config.from_yaml(config_path)
+    else:
+        config = Config()
+
+    # Use environment variables if available
+    api_key = os.environ.get("OPENAI_API_KEY")
+    api_base = os.environ.get("OPENAI_API_BASE", "https://api.openai.com/v1")
 
-    # Use environment variables if available
-    api_key = os.environ.get("OPENAI_API_KEY")
-    api_base = os.environ.get("OPENAI_API_BASE", "https://api.openai.com/v1")
+    config.llm.update_model_params({"api_key": api_key, "api_base": api_base})
 
-    config = Config()
-    if api_key:
-        config.llm.api_key = api_key
-    if api_base:
-        config.llm.api_base = api_base
+    # Make the system message available to the individual models, in case it is not provided from the prompt sampler
+    config.llm.update_model_params({"system_message": config.prompt.system_message})
 
     return config
```
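
A short usage sketch of the backward-compatibility path above (illustrative only; it assumes the `openevolve` package at this commit is importable, and the comments show the expected output):

```python
from openevolve.config import LLMConfig

# Legacy-style construction: only the old primary/secondary fields are given,
# no explicit `models` list.
cfg = LLMConfig(
    primary_model="gemini-2.0-flash-lite",
    primary_model_weight=0.8,
    secondary_model="gemini-2.0-flash",
    secondary_model_weight=0.2,
)

# __post_init__ translates the legacy fields into the ensemble list ...
print([(m.name, m.weight) for m in cfg.models])
# [('gemini-2.0-flash-lite', 0.8), ('gemini-2.0-flash', 0.2)]

# ... and, since no evaluator_models were given, the feedback ensemble
# reuses the same models.
print([(m.name, m.weight) for m in cfg.evaluator_models])
# [('gemini-2.0-flash-lite', 0.8), ('gemini-2.0-flash', 0.2)]

# Shared settings (api_base, temperature, ...) are pushed into each model
# only where the model has not set its own value; overwrite=True forces them.
cfg.update_model_params({"temperature": 0.9}, overwrite=True)
print({m.name: m.temperature for m in cfg.models})
# {'gemini-2.0-flash-lite': 0.9, 'gemini-2.0-flash': 0.9}
```

The same resolution runs when a YAML file without a `models` section is loaded through `load_config`, so existing configs keep working.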

openevolve/controller.py

Lines changed: 12 additions & 2 deletions

```diff
@@ -96,15 +96,25 @@ def __init__(
             self.file_extension = f".{self.file_extension}"
 
         # Initialize components
-        self.llm_ensemble = LLMEnsemble(self.config.llm)
+        self.llm_ensemble = LLMEnsemble(self.config.llm.models)
+        self.llm_evaluator_ensemble = LLMEnsemble(self.config.llm.evaluator_models)
+
         self.prompt_sampler = PromptSampler(self.config.prompt)
+        self.evaluator_prompt_sampler = PromptSampler(self.config.prompt)
+        self.evaluator_prompt_sampler.set_templates("evaluator_system_message")
 
         # Pass random seed to database if specified
         if self.config.random_seed is not None:
             self.config.database.random_seed = self.config.random_seed
 
         self.database = ProgramDatabase(self.config.database)
-        self.evaluator = Evaluator(self.config.evaluator, evaluation_file, self.llm_ensemble)
+
+        self.evaluator = Evaluator(
+            self.config.evaluator,
+            evaluation_file,
+            self.llm_evaluator_ensemble,
+            self.evaluator_prompt_sampler,
+        )
 
         logger.info(f"Initialized OpenEvolve with {initial_program_path} " f"and {evaluation_file}")
```

openevolve/evaluator.py

Lines changed: 48 additions & 45 deletions

````diff
@@ -14,10 +14,12 @@
 import uuid
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+import traceback
 
 from openevolve.config import EvaluatorConfig
 from openevolve.llm.ensemble import LLMEnsemble
 from openevolve.utils.async_utils import TaskPool, run_in_executor
+from openevolve.prompt.sampler import PromptSampler
 from openevolve.utils.format_utils import format_metrics_safe
 
 logger = logging.getLogger(__name__)
@@ -36,10 +38,12 @@ def __init__(
         config: EvaluatorConfig,
         evaluation_file: str,
         llm_ensemble: Optional[LLMEnsemble] = None,
+        prompt_sampler: Optional[PromptSampler] = None,
     ):
         self.config = config
         self.evaluation_file = evaluation_file
         self.llm_ensemble = llm_ensemble
+        self.prompt_sampler = prompt_sampler
 
         # Create a task pool for parallel evaluation
         self.task_pool = TaskPool(max_concurrency=config.parallel_evaluations)
@@ -286,67 +290,66 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
 
         try:
             # Create prompt for LLM
-            prompt = f"""
-            Evaluate the following code on a scale of 0.0 to 1.0 for the following metrics:
-            1. Readability: How easy is the code to read and understand?
-            2. Maintainability: How easy would the code be to maintain and modify?
-            3. Efficiency: How efficient is the code in terms of time and space complexity?
-
-            For each metric, provide a score between 0.0 and 1.0, where 1.0 is best.
-
-            Code to evaluate:
-            ```python
-            {program_code}
-            ```
-
-            Return your evaluation as a JSON object with the following format:
-            {{
-                "readability": [score],
-                "maintainability": [score],
-                "efficiency": [score],
-                "reasoning": "[brief explanation of scores]"
-            }}
-            """
+            prompt = self.prompt_sampler.build_prompt(
+                current_program=program_code, template_key="evaluation"
+            )
 
             # Get LLM response
-            response = await self.llm_ensemble.generate(prompt)
+            responses = await self.llm_ensemble.generate_all_with_context(
+                prompt["system"], [{"role": "user", "content": prompt["user"]}]
+            )
 
             # Extract JSON from response
             try:
                 # Try to find JSON block
                 json_pattern = r"```json\n(.*?)\n```"
                 import re
 
-                json_match = re.search(json_pattern, response, re.DOTALL)
-
-                if json_match:
-                    json_str = json_match.group(1)
-                else:
-                    # Try to extract JSON directly
-                    json_str = response
-                    # Remove non-JSON parts
-                    start_idx = json_str.find("{")
-                    end_idx = json_str.rfind("}") + 1
-                    if start_idx >= 0 and end_idx > start_idx:
-                        json_str = json_str[start_idx:end_idx]
-
-                # Parse JSON
-                result = json.loads(json_str)
-
-                # Extract metrics
-                metrics = {}
-                for key in ["readability", "maintainability", "efficiency"]:
-                    if key in result:
-                        metrics[key] = float(result[key])
-
-                return metrics
+                avg_metrics = {}
+                for i, response in enumerate(responses):
+                    json_match = re.search(json_pattern, response, re.DOTALL)
+
+                    if json_match:
+                        json_str = json_match.group(1)
+                    else:
+                        # Try to extract JSON directly
+                        json_str = response
+                        # Remove non-JSON parts
+                        start_idx = json_str.find("{")
+                        end_idx = json_str.rfind("}") + 1
+                        if start_idx >= 0 and end_idx > start_idx:
+                            json_str = json_str[start_idx:end_idx]
+
+                    # Parse JSON
+                    result = json.loads(json_str)
+
+                    # Filter all non-numeric values
+                    metrics = {
+                        name: float(value)
+                        for name, value in result.items()
+                        if isinstance(value, (int, float))
+                    }
+
+                    # Weight of the model in the ensemble
+                    weight = self.llm_ensemble.weights[i] if self.llm_ensemble.weights else 1.0
+
+                    # Average the metrics
+                    for name, value in metrics.items():
+                        if name in avg_metrics:
+                            avg_metrics[name] += value * weight
+                        else:
+                            avg_metrics[name] = value * weight
+
+                return avg_metrics
 
             except Exception as e:
                 logger.warning(f"Error parsing LLM response: {str(e)}")
+                traceback.print_exc()
                 return {}
 
         except Exception as e:
             logger.error(f"Error in LLM evaluation: {str(e)}")
+            traceback.print_exc()
             return {}
 
     def _passes_threshold(self, metrics: Dict[str, float], threshold: float) -> bool:
````
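
The new `_llm_evaluate` loop combines the per-model scores as a weight-scaled sum, which behaves as a weighted average when the ensemble weights sum to 1 (as they do in the default config). A standalone sketch of that aggregation step, factored out here purely for illustration with made-up scores:

```python
from typing import Dict, List


def aggregate_metrics(
    per_model: List[Dict[str, float]], weights: List[float]
) -> Dict[str, float]:
    """Combine metric dicts from an LLM ensemble, scaling each by its model's weight."""
    combined: Dict[str, float] = {}
    for metrics, weight in zip(per_model, weights):
        for name, value in metrics.items():
            combined[name] = combined.get(name, 0.0) + value * weight
    return combined


# Two evaluator models weighted 0.8 / 0.2, as in the default config:
print(aggregate_metrics(
    [{"readability": 0.9, "efficiency": 0.6}, {"readability": 0.5, "efficiency": 0.8}],
    [0.8, 0.2],
))
# ≈ {'readability': 0.82, 'efficiency': 0.64}
```

Note that a metric reported by only one model is scaled by that model's weight alone, so it ends up discounted rather than averaged.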
