diff --git a/configs/default_config.yaml b/configs/default_config.yaml
index 4bc7558aa..22f086b81 100644
--- a/configs/default_config.yaml
+++ b/configs/default_config.yaml
@@ -16,13 +16,21 @@ max_code_length: 10000  # Maximum allowed code length in character
 
 # LLM configuration
 llm:
-  # Primary model (used most frequently)
-  primary_model: "gemini-2.0-flash-lite"
-  primary_model_weight: 0.8  # Sampling weight for primary model
-
-  # Secondary model (used for occasional high-quality generations)
-  secondary_model: "gemini-2.0-flash"
-  secondary_model_weight: 0.2  # Sampling weight for secondary model
+  # Models for evolution
+  models:
+    # List of available models with their weights
+    - name: "gemini-2.0-flash-lite"
+      weight: 0.8
+    - name: "gemini-2.0-flash"
+      weight: 0.2
+
+  # Models for LLM feedback
+  evaluator_models:
+    # List of available models with their weights
+    - name: "gemini-2.0-flash-lite"
+      weight: 0.8
+    - name: "gemini-2.0-flash"
+      weight: 0.2
 
   # API configuration
   api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"  # Base URL for API (change for non-OpenAI models)
@@ -42,6 +50,7 @@ llm:
 prompt:
   template_dir: null  # Custom directory for prompt templates
   system_message: "You are an expert coder helping to improve programs through evolution."
+  evaluator_system_message: "You are an expert code reviewer."
 
   # Number of examples to include in the prompt
   num_top_programs: 3  # Number of top-performing programs to include
diff --git a/openevolve/config.py b/openevolve/config.py
index 460907ba4..4252409cd 100644
--- a/openevolve/config.py
+++ b/openevolve/config.py
@@ -11,22 +11,41 @@
 
 
 @dataclass
-class LLMConfig:
-    """Configuration for LLM models"""
+class LLMModelConfig:
+    """Configuration for a single LLM model"""
 
-    # Primary model
-    primary_model: str = "gemini-2.0-flash-lite"
-    primary_model_weight: float = 0.8
+    # API configuration
+    api_base: str = None
+    api_key: Optional[str] = None
+    name: str = None
 
-    # Secondary model
-    secondary_model: str = "gemini-2.0-flash"
-    secondary_model_weight: float = 0.2
+    # Weight for model in ensemble
+    weight: float = 1.0
+
+    # Generation parameters
+    system_message: Optional[str] = None
+    temperature: float = None
+    top_p: float = None
+    max_tokens: int = None
+
+    # Request parameters
+    timeout: int = None
+    retries: int = None
+    retry_delay: int = None
+
+
+@dataclass
+class LLMConfig(LLMModelConfig):
+    """Configuration for LLM models"""
 
     # API configuration
     api_base: str = "https://api.openai.com/v1"
-    api_key: Optional[str] = None
+    name: str = "gpt-4o"
 
     # Generation parameters
+    system_message: Optional[str] = (
+        "You are an expert coder helping to improve programs through evolution."
+    )
     temperature: float = 0.7
     top_p: float = 0.95
     max_tokens: int = 4096
@@ -36,6 +55,61 @@ class LLMConfig:
     retries: int = 3
     retry_delay: int = 5
 
+    # n-model configuration for evolution LLM ensemble
+    models: List[LLMModelConfig] = field(default_factory=lambda: [LLMModelConfig()])
+
+    # n-model configuration for evaluator LLM ensemble
+    evaluator_models: List[LLMModelConfig] = field(default_factory=lambda: [])
+
+    # Backwards compatibility with primary_model(_weight) options
+    primary_model: str = "gemini-2.0-flash-lite"
+    primary_model_weight: float = 0.8
+    secondary_model: str = "gemini-2.0-flash"
+    secondary_model_weight: float = 0.2
+
+    def __post_init__(self):
+        """Post-initialization to set up model configurations"""
+        # Handle backward compatibility for primary_model(_weight) and secondary_model(_weight).
+ if (self.primary_model or self.primary_model_weight) and len(self.models) < 1: + # Ensure we have a primary model + self.models.append(LLMModelConfig()) + if self.primary_model: + self.models[0].name = self.primary_model + if self.primary_model_weight: + self.models[0].weight = self.primary_model_weight + + if (self.secondary_model or self.secondary_model_weight) and len(self.models) < 2: + # Ensure we have a second model + self.models.append(LLMModelConfig()) + if self.secondary_model: + self.models[1].name = self.secondary_model + if self.secondary_model_weight: + self.models[1].weight = self.secondary_model_weight + + # If no evaluator models are defined, use the same models as for evolution + if not self.evaluator_models or len(self.evaluator_models) < 1: + self.evaluator_models = self.models.copy() + + # Update models with shared configuration values + shared_config = { + "api_base": self.api_base, + "api_key": self.api_key, + "temperature": self.temperature, + "top_p": self.top_p, + "max_tokens": self.max_tokens, + "timeout": self.timeout, + "retries": self.retries, + "retry_delay": self.retry_delay, + } + self.update_model_params(shared_config) + + def update_model_params(self, args: Dict[str, Any], overwrite: bool = False) -> None: + """Update model parameters for all models""" + for model in self.models + self.evaluator_models: + for key, value in args.items(): + if overwrite or getattr(model, key, None) is None: + setattr(model, key, value) + @dataclass class PromptConfig: @@ -43,6 +117,7 @@ class PromptConfig: template_dir: Optional[str] = None system_message: str = "You are an expert coder helping to improve programs through evolution." + evaluator_system_message: str = """You are an expert code reviewer.""" # Number of examples to include in the prompt num_top_programs: int = 3 @@ -155,7 +230,14 @@ def from_dict(cls, config_dict: Dict[str, Any]) -> "Config": # Update nested configs if "llm" in config_dict: - config.llm = LLMConfig(**config_dict["llm"]) + llm_dict = config_dict["llm"] + if "models" in llm_dict: + llm_dict["models"] = [LLMModelConfig(**m) for m in llm_dict["models"]] + if "evaluator_models" in llm_dict: + llm_dict["evaluator_models"] = [ + LLMModelConfig(**m) for m in llm_dict["evaluator_models"] + ] + config.llm = LLMConfig(**llm_dict) if "prompt" in config_dict: config.prompt = PromptConfig(**config_dict["prompt"]) if "database" in config_dict: @@ -176,10 +258,8 @@ def to_dict(self) -> Dict[str, Any]: "random_seed": self.random_seed, # Component configurations "llm": { - "primary_model": self.llm.primary_model, - "primary_model_weight": self.llm.primary_model_weight, - "secondary_model": self.llm.secondary_model, - "secondary_model_weight": self.llm.secondary_model_weight, + "models": self.llm.models, + "evaluator_models": self.llm.evaluator_models, "api_base": self.llm.api_base, "temperature": self.llm.temperature, "top_p": self.llm.top_p, @@ -191,6 +271,7 @@ def to_dict(self) -> Dict[str, Any]: "prompt": { "template_dir": self.prompt.template_dir, "system_message": self.prompt.system_message, + "evaluator_system_message": self.prompt.evaluator_system_message, "num_top_programs": self.prompt.num_top_programs, "num_diverse_programs": self.prompt.num_diverse_programs, "use_template_stochasticity": self.prompt.use_template_stochasticity, @@ -245,16 +326,17 @@ def to_yaml(self, path: Union[str, Path]) -> None: def load_config(config_path: Optional[Union[str, Path]] = None) -> Config: """Load configuration from a YAML file or use defaults""" if config_path and 
os.path.exists(config_path): - return Config.from_yaml(config_path) + config = Config.from_yaml(config_path) + else: + config = Config() + + # Use environment variables if available + api_key = os.environ.get("OPENAI_API_KEY") + api_base = os.environ.get("OPENAI_API_BASE", "https://api.openai.com/v1") - # Use environment variables if available - api_key = os.environ.get("OPENAI_API_KEY") - api_base = os.environ.get("OPENAI_API_BASE", "https://api.openai.com/v1") + config.llm.update_model_params({"api_key": api_key, "api_base": api_base}) - config = Config() - if api_key: - config.llm.api_key = api_key - if api_base: - config.llm.api_base = api_base + # Make the system message available to the individual models, in case it is not provided from the prompt sampler + config.llm.update_model_params({"system_message": config.prompt.system_message}) return config diff --git a/openevolve/controller.py b/openevolve/controller.py index c08194378..466b6d779 100644 --- a/openevolve/controller.py +++ b/openevolve/controller.py @@ -96,15 +96,25 @@ def __init__( self.file_extension = f".{self.file_extension}" # Initialize components - self.llm_ensemble = LLMEnsemble(self.config.llm) + self.llm_ensemble = LLMEnsemble(self.config.llm.models) + self.llm_evaluator_ensemble = LLMEnsemble(self.config.llm.evaluator_models) + self.prompt_sampler = PromptSampler(self.config.prompt) + self.evaluator_prompt_sampler = PromptSampler(self.config.prompt) + self.evaluator_prompt_sampler.set_templates("evaluator_system_message") # Pass random seed to database if specified if self.config.random_seed is not None: self.config.database.random_seed = self.config.random_seed self.database = ProgramDatabase(self.config.database) - self.evaluator = Evaluator(self.config.evaluator, evaluation_file, self.llm_ensemble) + + self.evaluator = Evaluator( + self.config.evaluator, + evaluation_file, + self.llm_evaluator_ensemble, + self.evaluator_prompt_sampler, + ) logger.info(f"Initialized OpenEvolve with {initial_program_path} " f"and {evaluation_file}") diff --git a/openevolve/evaluator.py b/openevolve/evaluator.py index fb9fbd8e2..8a94bca9e 100644 --- a/openevolve/evaluator.py +++ b/openevolve/evaluator.py @@ -14,10 +14,12 @@ import uuid from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union +import traceback from openevolve.config import EvaluatorConfig from openevolve.llm.ensemble import LLMEnsemble from openevolve.utils.async_utils import TaskPool, run_in_executor +from openevolve.prompt.sampler import PromptSampler from openevolve.utils.format_utils import format_metrics_safe logger = logging.getLogger(__name__) @@ -36,10 +38,12 @@ def __init__( config: EvaluatorConfig, evaluation_file: str, llm_ensemble: Optional[LLMEnsemble] = None, + prompt_sampler: Optional[PromptSampler] = None, ): self.config = config self.evaluation_file = evaluation_file self.llm_ensemble = llm_ensemble + self.prompt_sampler = prompt_sampler # Create a task pool for parallel evaluation self.task_pool = TaskPool(max_concurrency=config.parallel_evaluations) @@ -286,30 +290,14 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]: try: # Create prompt for LLM - prompt = f""" - Evaluate the following code on a scale of 0.0 to 1.0 for the following metrics: - 1. Readability: How easy is the code to read and understand? - 2. Maintainability: How easy would the code be to maintain and modify? - 3. Efficiency: How efficient is the code in terms of time and space complexity? 
- - For each metric, provide a score between 0.0 and 1.0, where 1.0 is best. - - Code to evaluate: - ```python - {program_code} - ``` - - Return your evaluation as a JSON object with the following format: - {{ - "readability": [score], - "maintainability": [score], - "efficiency": [score], - "reasoning": "[brief explanation of scores]" - }} - """ + prompt = self.prompt_sampler.build_prompt( + current_program=program_code, template_key="evaluation" + ) # Get LLM response - response = await self.llm_ensemble.generate(prompt) + responses = await self.llm_ensemble.generate_all_with_context( + prompt["system"], [{"role": "user", "content": prompt["user"]}] + ) # Extract JSON from response try: @@ -317,36 +305,51 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]: json_pattern = r"```json\n(.*?)\n```" import re - json_match = re.search(json_pattern, response, re.DOTALL) - - if json_match: - json_str = json_match.group(1) - else: - # Try to extract JSON directly - json_str = response - # Remove non-JSON parts - start_idx = json_str.find("{") - end_idx = json_str.rfind("}") + 1 - if start_idx >= 0 and end_idx > start_idx: - json_str = json_str[start_idx:end_idx] - - # Parse JSON - result = json.loads(json_str) - - # Extract metrics - metrics = {} - for key in ["readability", "maintainability", "efficiency"]: - if key in result: - metrics[key] = float(result[key]) - - return metrics + avg_metrics = {} + for i, response in enumerate(responses): + json_match = re.search(json_pattern, response, re.DOTALL) + + if json_match: + json_str = json_match.group(1) + else: + # Try to extract JSON directly + json_str = response + # Remove non-JSON parts + start_idx = json_str.find("{") + end_idx = json_str.rfind("}") + 1 + if start_idx >= 0 and end_idx > start_idx: + json_str = json_str[start_idx:end_idx] + + # Parse JSON + result = json.loads(json_str) + + # Filter all non-numeric values + metrics = { + name: float(value) + for name, value in result.items() + if isinstance(value, (int, float)) + } + + # Weight of the model in the ensemble + weight = self.llm_ensemble.weights[i] if self.llm_ensemble.weights else 1.0 + + # Average the metrics + for name, value in metrics.items(): + if name in avg_metrics: + avg_metrics[name] += value * weight + else: + avg_metrics[name] = value * weight + + return avg_metrics except Exception as e: logger.warning(f"Error parsing LLM response: {str(e)}") + traceback.print_exc() return {} except Exception as e: logger.error(f"Error in LLM evaluation: {str(e)}") + traceback.print_exc() return {} def _passes_threshold(self, metrics: Dict[str, float], threshold: float) -> bool: diff --git a/openevolve/llm/ensemble.py b/openevolve/llm/ensemble.py index 0c518ccad..b286ff68e 100644 --- a/openevolve/llm/ensemble.py +++ b/openevolve/llm/ensemble.py @@ -7,37 +7,33 @@ import random from typing import Dict, List, Optional, Tuple -from openevolve.config import LLMConfig from openevolve.llm.base import LLMInterface from openevolve.llm.openai import OpenAILLM +from openevolve.config import LLMModelConfig logger = logging.getLogger(__name__) class LLMEnsemble: - """Ensemble of LLMs for generating diverse code modifications""" + """Ensemble of LLMs""" - def __init__(self, config: LLMConfig): - self.config = config + def __init__(self, models_cfg: List[LLMModelConfig]): + self.models_cfg = models_cfg - # Initialize primary and secondary models - self.primary_model = OpenAILLM(config, model=config.primary_model) - self.secondary_model = OpenAILLM(config, 
model=config.secondary_model)
+        # Initialize models from the configuration
+        self.models = [OpenAILLM(model_cfg) for model_cfg in models_cfg]
 
-        # Model weights for sampling
-        self._weights = [
-            config.primary_model_weight,
-            config.secondary_model_weight,
-        ]
-
-        # Normalize weights
-        total = sum(self._weights)
-        self._weights = [w / total for w in self._weights]
+        # Extract and normalize model weights
+        self.weights = [model.weight for model in models_cfg]
+        total = sum(self.weights)
+        self.weights = [w / total for w in self.weights]
 
         logger.info(
             f"Initialized LLM ensemble with models: "
-            f"{config.primary_model} (weight: {self._weights[0]:.2f}), "
-            f"{config.secondary_model} (weight: {self._weights[1]:.2f})"
+            + ", ".join(
+                f"{model.name} (weight: {weight:.2f})"
+                for model, weight in zip(models_cfg, self.weights)
+            )
         )
 
     async def generate(self, prompt: str, **kwargs) -> str:
@@ -54,9 +50,8 @@ async def generate_with_context(
 
     def _sample_model(self) -> LLMInterface:
         """Sample a model from the ensemble based on weights"""
-        models = [self.primary_model, self.secondary_model]
-        index = random.choices(range(len(models)), weights=self._weights, k=1)[0]
-        return models[index]
+        index = random.choices(range(len(self.models)), weights=self.weights, k=1)[0]
+        return self.models[index]
 
     async def generate_multiple(self, prompt: str, n: int, **kwargs) -> List[str]:
         """Generate multiple texts in parallel"""
@@ -67,3 +62,12 @@ async def parallel_generate(self, prompts: List[str], **kwargs) -> List[str]:
         """Generate responses for multiple prompts in parallel"""
         tasks = [self.generate(prompt, **kwargs) for prompt in prompts]
         return await asyncio.gather(*tasks)
+
+    async def generate_all_with_context(
+        self, system_message: str, messages: List[Dict[str, str]], **kwargs
+    ) -> List[str]:
+        """Generate a response from every model in the ensemble"""
+        responses = []
+        for model in self.models:
+            responses.append(await model.generate_with_context(system_message, messages, **kwargs))
+        return responses
diff --git a/openevolve/llm/openai.py b/openevolve/llm/openai.py
index 9268b5703..c146ecc0c 100644
--- a/openevolve/llm/openai.py
+++ b/openevolve/llm/openai.py
@@ -20,26 +20,31 @@ class OpenAILLM(LLMInterface):
 
     def __init__(
         self,
-        config: LLMConfig,
-        model: Optional[str] = None,
+        model_cfg: Optional[dict] = None,
     ):
-        self.config = config
-        self.model = model or config.primary_model
+        self.model = model_cfg.name
+        self.system_message = model_cfg.system_message
+        self.temperature = model_cfg.temperature
+        self.top_p = model_cfg.top_p
+        self.max_tokens = model_cfg.max_tokens
+        self.timeout = model_cfg.timeout
+        self.retries = model_cfg.retries
+        self.retry_delay = model_cfg.retry_delay
+        self.api_base = model_cfg.api_base
+        self.api_key = model_cfg.api_key
 
         # Set up API client
         self.client = openai.OpenAI(
-            api_key=config.api_key,
-            base_url=config.api_base,
+            api_key=self.api_key,
+            base_url=self.api_base,
         )
 
         logger.info(f"Initialized OpenAI LLM with model: {self.model}")
 
     async def generate(self, prompt: str, **kwargs) -> str:
         """Generate text from a prompt"""
-        # Use default system message if not provided in kwargs
-        system_message = kwargs.pop("system_message", "You are a helpful assistant.")
         return await self.generate_with_context(
-            system_message=system_message,
+            system_message=self.system_message,
             messages=[{"role": "user", "content": prompt}],
             **kwargs,
         )
@@ -53,28 +58,26 @@ async def generate_with_context(
         formatted_messages.extend(messages)
 
         # Set up generation parameters
-        if 
self.config.api_base == "https://api.openai.com/v1" and str( - self.model - ).lower().startswith("o"): + if self.api_base == "https://api.openai.com/v1" and str(self.model).lower().startswith("o"): # For o-series models params = { "model": self.model, "messages": formatted_messages, - "max_completion_tokens": kwargs.get("max_tokens", self.config.max_tokens), + "max_completion_tokens": kwargs.get("max_tokens", self.max_tokens), } else: params = { "model": self.model, "messages": formatted_messages, - "temperature": kwargs.get("temperature", self.config.temperature), - "top_p": kwargs.get("top_p", self.config.top_p), - "max_tokens": kwargs.get("max_tokens", self.config.max_tokens), + "temperature": kwargs.get("temperature", self.temperature), + "top_p": kwargs.get("top_p", self.top_p), + "max_tokens": kwargs.get("max_tokens", self.max_tokens), } # Attempt the API call with retries - retries = kwargs.get("retries", self.config.retries) - retry_delay = kwargs.get("retry_delay", self.config.retry_delay) - timeout = kwargs.get("timeout", self.config.timeout) + retries = kwargs.get("retries", self.retries) + retry_delay = kwargs.get("retry_delay", self.retry_delay) + timeout = kwargs.get("timeout", self.timeout) for attempt in range(retries + 1): try: @@ -104,6 +107,4 @@ async def _call_api(self, params: Dict[str, Any]) -> str: response = await loop.run_in_executor( None, lambda: self.client.chat.completions.create(**params) ) - - # Extract the response content return response.choices[0].message.content diff --git a/openevolve/prompt/sampler.py b/openevolve/prompt/sampler.py index a05910b98..6d543424b 100644 --- a/openevolve/prompt/sampler.py +++ b/openevolve/prompt/sampler.py @@ -46,15 +46,16 @@ def set_templates( def build_prompt( self, - current_program: str, - parent_program: str, - program_metrics: Dict[str, float], - previous_programs: List[Dict[str, Any]], - top_programs: List[Dict[str, Any]], + current_program: str = "", + parent_program: str = "", + program_metrics: Dict[str, float] = {}, + previous_programs: List[Dict[str, Any]] = [], + top_programs: List[Dict[str, Any]] = [], language: str = "python", evolution_round: int = 0, allow_full_rewrite: bool = False, template_key: Optional[str] = None, + **kwargs: Any, ) -> Dict[str, str]: """ Build a prompt for the LLM @@ -69,6 +70,7 @@ def build_prompt( evolution_round: Current evolution round allow_full_rewrite: Whether to allow a full rewrite template_key: Optional override for template key + **kwargs: Additional keys to replace in the user prompt Returns: Dictionary with 'system' and 'user' keys @@ -120,6 +122,7 @@ def build_prompt( evolution_history=evolution_history, current_program=current_program, language=language, + **kwargs, ) return { diff --git a/openevolve/prompt/templates.py b/openevolve/prompt/templates.py index b50e34e9b..82d5a6b03 100644 --- a/openevolve/prompt/templates.py +++ b/openevolve/prompt/templates.py @@ -12,6 +12,9 @@ Focus on making targeted changes that will increase the program's performance metrics. """ +BASE_EVALUATOR_SYSTEM_TEMPLATE = """You are an expert code reviewer. +Your job is to analyze the provided code and evaluate it systematically.""" + # User message template for diff-based evolution DIFF_USER_TEMPLATE = """# Current Program Information - Current performance metrics: {metrics} @@ -106,14 +109,38 @@ Key features: {key_features} """ +# Template for evaluating a program via an LLM +EVALUATION_TEMPLATE = """Evaluate the following code on a scale of 0.0 to 1.0 for the following metrics: +1. 
Readability: How easy is the code to read and understand?
+2. Maintainability: How easy would the code be to maintain and modify?
+3. Efficiency: How efficient is the code in terms of time and space complexity?
+
+For each metric, provide a score between 0.0 and 1.0, where 1.0 is best.
+
+Code to evaluate:
+```python
+{current_program}
+```
+
+Return your evaluation as a JSON object with the following format:
+{{
+    "readability": [score],
+    "maintainability": [score],
+    "efficiency": [score],
+    "reasoning": "[brief explanation of scores]"
+}}
+"""
+
 # Default templates dictionary
 DEFAULT_TEMPLATES = {
     "system_message": BASE_SYSTEM_TEMPLATE,
+    "evaluator_system_message": BASE_EVALUATOR_SYSTEM_TEMPLATE,
     "diff_user": DIFF_USER_TEMPLATE,
     "full_rewrite_user": FULL_REWRITE_USER_TEMPLATE,
     "evolution_history": EVOLUTION_HISTORY_TEMPLATE,
     "previous_attempt": PREVIOUS_ATTEMPT_TEMPLATE,
     "top_program": TOP_PROGRAM_TEMPLATE,
+    "evaluation": EVALUATION_TEMPLATE,
 }
diff --git a/tests/test_valid_configs.py b/tests/test_valid_configs.py
new file mode 100644
index 000000000..829d23b42
--- /dev/null
+++ b/tests/test_valid_configs.py
@@ -0,0 +1,38 @@
+"""
+Confirming the validity of configuration files in project directories
+"""
+
+import os
+import unittest
+
+from openevolve.config import Config, load_config
+
+
+class TestConfigValidity(unittest.TestCase):
+    """Tests that all config files in the configs/ and examples/ directories are valid"""
+
+    def collect_files(self):
+        """Collect all configs/*config*.yaml and examples/**/*config*.yaml files"""
+        config_dir = os.path.join(os.path.dirname(__file__), "../configs")
+        example_dir = os.path.join(os.path.dirname(__file__), "../examples")
+        config_files = []
+        for search_dir in (config_dir, example_dir):
+            for root, _, files in os.walk(search_dir):
+                for file in files:
+                    if "config" in file and file.endswith(".yaml"):
+                        config_files.append(os.path.join(root, file))
+        return config_files
+
+    def test_import_config_files(self):
+        """Attempt to load all collected config files"""
+        config_files = self.collect_files()
+        for config_file in config_files:
+            print(f"Testing config file: {config_file}")
+            config = load_config(config_file)
+            self.assertIsInstance(
+                config, Config, f"Config file {config_file} did not load correctly"
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
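
A minimal usage sketch of the wiring introduced by this patch (not part of the diff itself): the evolution ensemble and the evaluator ensemble are now built from separate weighted model lists, mirroring the controller changes above. The config path and the presence of an API key in the environment are assumptions for illustration.

```python
# Sketch only: construct the two ensembles the same way the controller now does.
from openevolve.config import load_config
from openevolve.llm.ensemble import LLMEnsemble

# Assumes the repo's configs/default_config.yaml and that OPENAI_API_KEY
# (or the key expected by the configured api_base) is set in the environment.
config = load_config("configs/default_config.yaml")

# One weighted ensemble drives code evolution, the other provides LLM feedback;
# each samples a model per request via random.choices over normalized weights.
evolution_ensemble = LLMEnsemble(config.llm.models)
evaluator_ensemble = LLMEnsemble(config.llm.evaluator_models)
```

Legacy configs that still set `primary_model`/`secondary_model` are folded into `config.llm.models` by `LLMConfig.__post_init__`, so the same construction works for both old and new YAML files.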