From f84be60af0c60ce0a0ec4cc883c1f697baa7dec5 Mon Sep 17 00:00:00 2001
From: Julian
Date: Mon, 2 Jun 2025 23:57:58 +0200
Subject: [PATCH 1/2] Better support for LLM feedback and handling of LLM ensembles.

- config.py supports configuration of n-model ensembles for evolution and, optionally, a separate ensemble for evaluation; the yaml format stays backwards compatible; settings can be made for all models in llm: or for a specific model in llm: models:; new evaluator_system_message setting (see the config sketch at the end of this message)
- ensemble.py supports n-model ensembles
- OpenAILLM supports individual parameter configuration per model
- ensemble.py has a new generate_all_with_context() function
- evaluator.py uses the prompt sampler to generate LLM feedback prompts
- templates.py contains default prompts for LLM feedback

With the function_minimization example, set use_llm_feedback: true in its config.yaml. The LLM feedback will then provide output such as

```
{
  "readability": 0.92,
  "maintainability": 0.88,
  "efficiency": 0.82,
  "reasoning": "The code is quite readable, with clear function and variable names, concise comments, and a docstring explaining the purpose and arguments of the main search function. There is some minor room for improvement, such as splitting up large inner loops or extracting repeated logic, but overall it is easy to follow. Maintainability is high due to modularization and descriptive naming, but could be slightly improved by reducing the nesting level and possibly moving the annealing routine to its own top-level function. Efficiency is good for a simple global optimization approach; vectorized numpy operations are used where appropriate, and the population-based simulated annealing is a reasonable trade-off between exploration and exploitation. However, the algorithm could be further optimized (e.g., by fully vectorizing more of the walker updates or parallelizing restarts), and the approach is not the most efficient for high-dimensional or more complex landscapes."
}
```

The evolution can then consider the additional values:

```
Evolution complete!
Best program metrics:
  runs_successfully: 1.0000
  value_score: 0.9997
  distance_score: 0.9991
  overall_score: 0.9905
  standard_deviation_score: 0.9992
  speed_score: 0.0610
  reliability_score: 1.0000
  combined_score: 0.9525
  success_rate: 1.0000
  llm_readability: 0.0904
  llm_maintainability: 0.0816
  llm_efficiency: 0.0764
```

Note: I did not evaluate the results yet.
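For reference, here is a minimal sketch of a config.yaml that exercises the new options. The models / evaluator_models lists and evaluator_system_message come straight from this patch; the model names and weights are illustrative, and the placement of use_llm_feedback under the evaluator section is an assumption based on the existing example configs, not something this patch changes.

```yaml
# Minimal sketch (illustrative values, not new defaults)
llm:
  # Shared settings; these only fill in fields a model entry does not set itself
  api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
  temperature: 0.7

  # Ensemble used for evolution (any number of entries)
  models:
    - name: "gemini-2.0-flash-lite"
      weight: 0.8
    - name: "gemini-2.0-flash"
      weight: 0.2

  # Optional separate ensemble for LLM feedback; falls back to `models` if omitted
  evaluator_models:
    - name: "gemini-2.0-flash"
      weight: 1.0

prompt:
  evaluator_system_message: "You are an expert code reviewer."

evaluator:
  use_llm_feedback: true  # assumed to live under the evaluator section
```

Per-model overrides (for example a different temperature or api_base on one list entry) take precedence, because shared llm: values are only copied into fields that are still unset.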
---
 configs/default_config.yaml    |  23 ++++--
 openevolve/config.py           | 126 +++++++++++++++++++++++++++------
 openevolve/controller.py       |  14 +++-
 openevolve/evaluator.py        |  93 ++++++++++++------------
 openevolve/llm/ensemble.py     |  46 ++++++------
 openevolve/llm/openai.py       |  43 +++++------
 openevolve/prompt/sampler.py   |  13 ++--
 openevolve/prompt/templates.py |  27 +++++++
 tests/test_valid_configs.py    |  34 +++++++++
 9 files changed, 295 insertions(+), 124 deletions(-)
 create mode 100644 tests/test_valid_configs.py

diff --git a/configs/default_config.yaml b/configs/default_config.yaml
index 4bc7558aa..22f086b81 100644
--- a/configs/default_config.yaml
+++ b/configs/default_config.yaml
@@ -16,13 +16,21 @@ max_code_length: 10000  # Maximum allowed code length in character
 
 # LLM configuration
 llm:
-  # Primary model (used most frequently)
-  primary_model: "gemini-2.0-flash-lite"
-  primary_model_weight: 0.8  # Sampling weight for primary model
-
-  # Secondary model (used for occasional high-quality generations)
-  secondary_model: "gemini-2.0-flash"
-  secondary_model_weight: 0.2  # Sampling weight for secondary model
+  # Models for evolution
+  models:
+    # List of available models with their weights
+    - name: "gemini-2.0-flash-lite"
+      weight: 0.8
+    - name: "gemini-2.0-flash"
+      weight: 0.2
+
+  # Models for LLM feedback
+  evaluator_models:
+    # List of available models with their weights
+    - name: "gemini-2.0-flash-lite"
+      weight: 0.8
+    - name: "gemini-2.0-flash"
+      weight: 0.2
 
   # API configuration
   api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"  # Base URL for API (change for non-OpenAI models)
@@ -42,6 +50,7 @@ llm:
 prompt:
   template_dir: null  # Custom directory for prompt templates
   system_message: "You are an expert coder helping to improve programs through evolution."
+  evaluator_system_message: "You are an expert code reviewer."
 
   # Number of examples to include in the prompt
   num_top_programs: 3  # Number of top-performing programs to include

diff --git a/openevolve/config.py b/openevolve/config.py
index 460907ba4..c742ef945 100644
--- a/openevolve/config.py
+++ b/openevolve/config.py
@@ -11,22 +11,41 @@
 
 
 @dataclass
-class LLMConfig:
-    """Configuration for LLM models"""
+class LLMModelConfig:
+    """Configuration for a single LLM model"""
 
-    # Primary model
-    primary_model: str = "gemini-2.0-flash-lite"
-    primary_model_weight: float = 0.8
+    # API configuration
+    api_base: str = None
+    api_key: Optional[str] = None
+    name: str = None
 
-    # Secondary model
-    secondary_model: str = "gemini-2.0-flash"
-    secondary_model_weight: float = 0.2
+    # Weight for model in ensemble
+    weight: float = 1.0
+
+    # Generation parameters
+    system_message: Optional[str] = None
+    temperature: float = None
+    top_p: float = None
+    max_tokens: int = None
+
+    # Request parameters
+    timeout: int = None
+    retries: int = None
+    retry_delay: int = None
+
+
+@dataclass
+class LLMConfig(LLMModelConfig):
+    """Configuration for LLM models"""
 
     # API configuration
     api_base: str = "https://api.openai.com/v1"
-    api_key: Optional[str] = None
+    name: str = "gpt-4o"
 
     # Generation parameters
+    system_message: Optional[str] = (
+        "You are an expert coder helping to improve programs through evolution."
+    )
     temperature: float = 0.7
     top_p: float = 0.95
     max_tokens: int = 4096
@@ -36,6 +55,61 @@ class LLMConfig:
     retries: int = 3
     retry_delay: int = 5
 
+    # n-model configuration for evolution LLM ensemble
+    models: List[LLMModelConfig] = field(default_factory=lambda: [LLMModelConfig()])
+
+    # n-model configuration for evaluator LLM ensemble
+    evaluator_models: List[LLMModelConfig] = field(default_factory=lambda: [])
+
+    # Backwards compatibility with primary_model(_weight) options
+    primary_model: str = "gemini-2.0-flash-lite"
+    primary_model_weight: float = 0.8
+    secondary_model: str = "gemini-2.0-flash"
+    secondary_model_weight: float = 0.2
+
+    def __post_init__(self):
+        """Post-initialization to set up model configurations"""
+        # Handle backward compatibility for primary_model(_weight) and secondary_model(_weight).
+        if (self.primary_model or self.primary_model_weight) and len(self.models) < 1:
+            # Ensure we have a primary model
+            self.models.append(LLMModelConfig())
+            if self.primary_model:
+                self.models[0].name = self.primary_model
+            if self.primary_model_weight:
+                self.models[0].weight = self.primary_model_weight
+
+        if (self.secondary_model or self.secondary_model_weight) and len(self.models) < 2:
+            # Ensure we have a second model
+            self.models.append(LLMModelConfig())
+            if self.secondary_model:
+                self.models[1].name = self.secondary_model
+            if self.secondary_model_weight:
+                self.models[1].weight = self.secondary_model_weight
+
+        # If no evaluator models are defined, use the same models as for evolution
+        if not self.evaluator_models or len(self.evaluator_models) < 1:
+            self.evaluator_models = self.models.copy()
+
+        # Update models with shared configuration values
+        shared_config = {
+            "api_base": self.api_base,
+            "api_key": self.api_key,
+            "temperature": self.temperature,
+            "top_p": self.top_p,
+            "max_tokens": self.max_tokens,
+            "timeout": self.timeout,
+            "retries": self.retries,
+            "retry_delay": self.retry_delay,
+        }
+        self.update_model_params(shared_config)
+
+    def update_model_params(self, args: Dict[str, Any], overwrite: bool = False) -> None:
+        """Update model parameters for all models"""
+        for model in self.models + self.evaluator_models:
+            for key, value in args.items():
+                if overwrite or getattr(model, key, None) is None:
+                    setattr(model, key, value)
+
 
 @dataclass
 class PromptConfig:
@@ -43,6 +117,7 @@ class PromptConfig:
 
     template_dir: Optional[str] = None
     system_message: str = "You are an expert coder helping to improve programs through evolution."
+    evaluator_system_message: str = """You are an expert code reviewer."""
 
     # Number of examples to include in the prompt
     num_top_programs: int = 3
@@ -155,7 +230,12 @@ def from_dict(cls, config_dict: Dict[str, Any]) -> "Config":
 
         # Update nested configs
         if "llm" in config_dict:
-            config.llm = LLMConfig(**config_dict["llm"])
+            llm_dict = config_dict["llm"]
+            if "models" in llm_dict:
+                llm_dict["models"] = [LLMModelConfig(**m) for m in llm_dict["models"]]
+            if "evaluator_models" in llm_dict:
+                llm_dict["evaluator_models"] = [LLMModelConfig(**m) for m in llm_dict["evaluator_models"]]
+            config.llm = LLMConfig(**llm_dict)
         if "prompt" in config_dict:
             config.prompt = PromptConfig(**config_dict["prompt"])
         if "database" in config_dict:
@@ -176,10 +256,8 @@ def to_dict(self) -> Dict[str, Any]:
             "random_seed": self.random_seed,
             # Component configurations
             "llm": {
-                "primary_model": self.llm.primary_model,
-                "primary_model_weight": self.llm.primary_model_weight,
-                "secondary_model": self.llm.secondary_model,
-                "secondary_model_weight": self.llm.secondary_model_weight,
+                "models": self.llm.models,
+                "evaluator_models": self.llm.evaluator_models,
                 "api_base": self.llm.api_base,
                 "temperature": self.llm.temperature,
                 "top_p": self.llm.top_p,
@@ -191,6 +269,7 @@ def to_dict(self) -> Dict[str, Any]:
             "prompt": {
                 "template_dir": self.prompt.template_dir,
                 "system_message": self.prompt.system_message,
+                "evaluator_system_message": self.prompt.evaluator_system_message,
                 "num_top_programs": self.prompt.num_top_programs,
                 "num_diverse_programs": self.prompt.num_diverse_programs,
                 "use_template_stochasticity": self.prompt.use_template_stochasticity,
@@ -245,16 +324,17 @@ def to_yaml(self, path: Union[str, Path]) -> None:
 def load_config(config_path: Optional[Union[str, Path]] = None) -> Config:
     """Load configuration from a YAML file or use defaults"""
     if config_path and os.path.exists(config_path):
-        return Config.from_yaml(config_path)
+        config = Config.from_yaml(config_path)
+    else:
+        config = Config()
+
+    # Use environment variables if available
+    api_key = os.environ.get("OPENAI_API_KEY")
+    api_base = os.environ.get("OPENAI_API_BASE", "https://api.openai.com/v1")
 
-    # Use environment variables if available
-    api_key = os.environ.get("OPENAI_API_KEY")
-    api_base = os.environ.get("OPENAI_API_BASE", "https://api.openai.com/v1")
+    config.llm.update_model_params({"api_key": api_key, "api_base": api_base})
 
-    config = Config()
-    if api_key:
-        config.llm.api_key = api_key
-    if api_base:
-        config.llm.api_base = api_base
+    # Make the system message available to the individual models, in case it is not provided by the prompt sampler
+    config.llm.update_model_params({"system_message": config.prompt.system_message})
 
     return config

diff --git a/openevolve/controller.py b/openevolve/controller.py
index 68d3e0c12..85eda97fc 100644
--- a/openevolve/controller.py
+++ b/openevolve/controller.py
@@ -92,15 +92,25 @@ def __init__(
             self.file_extension = f".{self.file_extension}"
 
         # Initialize components
-        self.llm_ensemble = LLMEnsemble(self.config.llm)
+        self.llm_ensemble = LLMEnsemble(self.config.llm.models)
+        self.llm_evaluator_ensemble = LLMEnsemble(self.config.llm.evaluator_models)
+
         self.prompt_sampler = PromptSampler(self.config.prompt)
+        self.evaluator_prompt_sampler = PromptSampler(self.config.prompt)
+        self.evaluator_prompt_sampler.set_templates("evaluator_system_message")
 
         # Pass random seed to database if specified
        if self.config.random_seed is not None:
             self.config.database.random_seed = self.config.random_seed
 
         self.database = ProgramDatabase(self.config.database)
-        self.evaluator = Evaluator(self.config.evaluator, evaluation_file, self.llm_ensemble)
+
+        self.evaluator = Evaluator(
+            self.config.evaluator,
+            evaluation_file,
+            self.llm_evaluator_ensemble,
+            self.evaluator_prompt_sampler,
+        )
 
         logger.info(f"Initialized OpenEvolve with {initial_program_path} " f"and {evaluation_file}")

diff --git a/openevolve/evaluator.py b/openevolve/evaluator.py
index d7f4ed654..7c093667d 100644
--- a/openevolve/evaluator.py
+++ b/openevolve/evaluator.py
@@ -14,10 +14,12 @@
 import uuid
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+import traceback
 
 from openevolve.config import EvaluatorConfig
 from openevolve.llm.ensemble import LLMEnsemble
 from openevolve.utils.async_utils import TaskPool, run_in_executor
+from openevolve.prompt.sampler import PromptSampler
 
 logger = logging.getLogger(__name__)
 
@@ -35,10 +37,12 @@ def __init__(
         config: EvaluatorConfig,
         evaluation_file: str,
         llm_ensemble: Optional[LLMEnsemble] = None,
+        prompt_sampler: Optional[PromptSampler] = None,
     ):
         self.config = config
         self.evaluation_file = evaluation_file
         self.llm_ensemble = llm_ensemble
+        self.prompt_sampler = prompt_sampler
 
         # Create a task pool for parallel evaluation
         self.task_pool = TaskPool(max_concurrency=config.parallel_evaluations)
@@ -285,30 +289,14 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
 
         try:
             # Create prompt for LLM
-            prompt = f"""
-            Evaluate the following code on a scale of 0.0 to 1.0 for the following metrics:
-            1. Readability: How easy is the code to read and understand?
-            2. Maintainability: How easy would the code be to maintain and modify?
-            3. Efficiency: How efficient is the code in terms of time and space complexity?
-
-            For each metric, provide a score between 0.0 and 1.0, where 1.0 is best.
-
-            Code to evaluate:
-            ```python
-            {program_code}
-            ```
-
-            Return your evaluation as a JSON object with the following format:
-            {{
-                "readability": [score],
-                "maintainability": [score],
-                "efficiency": [score],
-                "reasoning": "[brief explanation of scores]"
-            }}
-            """
+            prompt = self.prompt_sampler.build_prompt(
+                current_program=program_code, template_key="evaluation"
+            )
 
             # Get LLM response
-            response = await self.llm_ensemble.generate(prompt)
+            responses = await self.llm_ensemble.generate_all_with_context(
+                prompt["system"], [{"role": "user", "content": prompt["user"]}]
+            )
 
             # Extract JSON from response
             try:
@@ -316,36 +304,51 @@ async def _llm_evaluate(self, program_code: str) -> Dict[str, float]:
                 json_pattern = r"```json\n(.*?)\n```"
                 import re
 
-                json_match = re.search(json_pattern, response, re.DOTALL)
-
-                if json_match:
-                    json_str = json_match.group(1)
-                else:
-                    # Try to extract JSON directly
-                    json_str = response
-                    # Remove non-JSON parts
-                    start_idx = json_str.find("{")
-                    end_idx = json_str.rfind("}") + 1
-                    if start_idx >= 0 and end_idx > start_idx:
-                        json_str = json_str[start_idx:end_idx]
-
-                # Parse JSON
-                result = json.loads(json_str)
-
-                # Extract metrics
-                metrics = {}
-                for key in ["readability", "maintainability", "efficiency"]:
-                    if key in result:
-                        metrics[key] = float(result[key])
-
-                return metrics
+                avg_metrics = {}
+                for i, response in enumerate(responses):
+                    json_match = re.search(json_pattern, response, re.DOTALL)
+
+                    if json_match:
+                        json_str = json_match.group(1)
+                    else:
+                        # Try to extract JSON directly
+                        json_str = response
+                        # Remove non-JSON parts
+                        start_idx = json_str.find("{")
+                        end_idx = json_str.rfind("}") + 1
+                        if start_idx >= 0 and end_idx > start_idx:
+                            json_str = json_str[start_idx:end_idx]
+
+                    # Parse JSON
+                    result = json.loads(json_str)
+
+                    # Filter out all non-numeric values
+                    metrics = {
+                        name: float(value)
+                        for name, value in result.items()
+                        if isinstance(value, (int, float))
+                    }
+
+                    # Weight of the model in the ensemble
+                    weight = self.llm_ensemble.weights[i] if self.llm_ensemble.weights else 1.0
+
+                    # Average the metrics
+                    for name, value in metrics.items():
+                        if name in avg_metrics:
+                            avg_metrics[name] += value * weight
+                        else:
+                            avg_metrics[name] = value * weight
+
+                return avg_metrics
             except Exception as e:
                 logger.warning(f"Error parsing LLM response: {str(e)}")
+                traceback.print_exc()
                 return {}
 
         except Exception as e:
             logger.error(f"Error in LLM evaluation: {str(e)}")
+            traceback.print_exc()
             return {}
 
     def _passes_threshold(self, metrics: Dict[str, float], threshold: float) -> bool:

diff --git a/openevolve/llm/ensemble.py b/openevolve/llm/ensemble.py
index 0c518ccad..b286ff68e 100644
--- a/openevolve/llm/ensemble.py
+++ b/openevolve/llm/ensemble.py
@@ -7,37 +7,33 @@
 import random
 from typing import Dict, List, Optional, Tuple
 
-from openevolve.config import LLMConfig
 from openevolve.llm.base import LLMInterface
 from openevolve.llm.openai import OpenAILLM
+from openevolve.config import LLMModelConfig
 
 logger = logging.getLogger(__name__)
 
 
 class LLMEnsemble:
-    """Ensemble of LLMs for generating diverse code modifications"""
+    """Ensemble of LLMs"""
 
-    def __init__(self, config: LLMConfig):
-        self.config = config
+    def __init__(self, models_cfg: List[LLMModelConfig]):
+        self.models_cfg = models_cfg
 
-        # Initialize primary and secondary models
-        self.primary_model = OpenAILLM(config, model=config.primary_model)
-        self.secondary_model = OpenAILLM(config, model=config.secondary_model)
+        # Initialize models from the configuration
+        self.models = [OpenAILLM(model_cfg) for model_cfg in models_cfg]
 
-        # Model weights for sampling
-        self._weights = [
-            config.primary_model_weight,
-            config.secondary_model_weight,
-        ]
-
-        # Normalize weights
-        total = sum(self._weights)
-        self._weights = [w / total for w in self._weights]
+        # Extract and normalize model weights
+        self.weights = [model.weight for model in models_cfg]
+        total = sum(self.weights)
+        self.weights = [w / total for w in self.weights]
 
         logger.info(
             f"Initialized LLM ensemble with models: "
-            f"{config.primary_model} (weight: {self._weights[0]:.2f}), "
-            f"{config.secondary_model} (weight: {self._weights[1]:.2f})"
+            + ", ".join(
+                f"{model.name} (weight: {weight:.2f})"
+                for model, weight in zip(models_cfg, self.weights)
+            )
         )
 
     async def generate(self, prompt: str, **kwargs) -> str:
@@ -54,9 +50,8 @@ async def generate_with_context(
 
     def _sample_model(self) -> LLMInterface:
         """Sample a model from the ensemble based on weights"""
-        models = [self.primary_model, self.secondary_model]
-        index = random.choices(range(len(models)), weights=self._weights, k=1)[0]
-        return models[index]
+        index = random.choices(range(len(self.models)), weights=self.weights, k=1)[0]
+        return self.models[index]
 
     async def generate_multiple(self, prompt: str, n: int, **kwargs) -> List[str]:
         """Generate multiple texts in parallel"""
@@ -67,3 +62,12 @@ async def parallel_generate(self, prompts: List[str], **kwargs) -> List[str]:
         """Generate responses for multiple prompts in parallel"""
         tasks = [self.generate(prompt, **kwargs) for prompt in prompts]
         return await asyncio.gather(*tasks)
+
+    async def generate_all_with_context(
+        self, system_message: str, messages: List[Dict[str, str]], **kwargs
+    ) -> List[str]:
+        """Generate a response from every model in the ensemble"""
+        responses = []
+        for model in self.models:
+            responses.append(await model.generate_with_context(system_message, messages, **kwargs))
+        return responses

diff --git a/openevolve/llm/openai.py b/openevolve/llm/openai.py
index 9268b5703..c146ecc0c 100644
--- a/openevolve/llm/openai.py
+++ b/openevolve/llm/openai.py
@@ -20,26 +20,31 @@ class OpenAILLM(LLMInterface):
 
     def __init__(
         self,
-        config: LLMConfig,
-        model: Optional[str] = None,
+        model_cfg: Optional[dict] = None,
     ):
-        self.config = config
-        self.model = model or config.primary_model
+        self.model = model_cfg.name
+        self.system_message = model_cfg.system_message
+        self.temperature = model_cfg.temperature
+        self.top_p = model_cfg.top_p
+        self.max_tokens = model_cfg.max_tokens
+        self.timeout = model_cfg.timeout
+        self.retries = model_cfg.retries
+        self.retry_delay = model_cfg.retry_delay
+        self.api_base = model_cfg.api_base
+        self.api_key = model_cfg.api_key
 
         # Set up API client
         self.client = openai.OpenAI(
-            api_key=config.api_key,
-            base_url=config.api_base,
+            api_key=self.api_key,
+            base_url=self.api_base,
         )
 
         logger.info(f"Initialized OpenAI LLM with model: {self.model}")
 
     async def generate(self, prompt: str, **kwargs) -> str:
         """Generate text from a prompt"""
-        # Use default system message if not provided in kwargs
-        system_message = kwargs.pop("system_message", "You are a helpful assistant.")
         return await self.generate_with_context(
-            system_message=system_message,
+            system_message=self.system_message,
             messages=[{"role": "user", "content": prompt}],
             **kwargs,
         )
@@ -53,28 +58,26 @@ async def generate_with_context(
         formatted_messages.extend(messages)
 
         # Set up generation parameters
-        if self.config.api_base == "https://api.openai.com/v1" and str(
-            self.model
-        ).lower().startswith("o"):
+        if self.api_base == "https://api.openai.com/v1" and str(self.model).lower().startswith("o"):
             # For o-series models
             params = {
                 "model": self.model,
                 "messages": formatted_messages,
-                "max_completion_tokens": kwargs.get("max_tokens", self.config.max_tokens),
+                "max_completion_tokens": kwargs.get("max_tokens", self.max_tokens),
             }
         else:
             params = {
                 "model": self.model,
                 "messages": formatted_messages,
-                "temperature": kwargs.get("temperature", self.config.temperature),
-                "top_p": kwargs.get("top_p", self.config.top_p),
-                "max_tokens": kwargs.get("max_tokens", self.config.max_tokens),
+                "temperature": kwargs.get("temperature", self.temperature),
+                "top_p": kwargs.get("top_p", self.top_p),
+                "max_tokens": kwargs.get("max_tokens", self.max_tokens),
             }
 
         # Attempt the API call with retries
-        retries = kwargs.get("retries", self.config.retries)
-        retry_delay = kwargs.get("retry_delay", self.config.retry_delay)
-        timeout = kwargs.get("timeout", self.config.timeout)
+        retries = kwargs.get("retries", self.retries)
+        retry_delay = kwargs.get("retry_delay", self.retry_delay)
+        timeout = kwargs.get("timeout", self.timeout)
 
         for attempt in range(retries + 1):
             try:
@@ -104,6 +107,4 @@ async def _call_api(self, params: Dict[str, Any]) -> str:
         response = await loop.run_in_executor(
             None, lambda: self.client.chat.completions.create(**params)
         )
-
-        # Extract the response content
         return response.choices[0].message.content

diff --git a/openevolve/prompt/sampler.py b/openevolve/prompt/sampler.py
index ad7a6be38..605c4b1da 100644
--- a/openevolve/prompt/sampler.py
+++ b/openevolve/prompt/sampler.py
@@ -44,15 +44,16 @@ def set_templates(
 
     def build_prompt(
         self,
-        current_program: str,
-        parent_program: str,
-        program_metrics: Dict[str, float],
-        previous_programs: List[Dict[str, Any]],
-        top_programs: List[Dict[str, Any]],
+        current_program: str = "",
+        parent_program: str = "",
+        program_metrics: Dict[str, float] = {},
+        previous_programs: List[Dict[str, Any]] = [],
+        top_programs: List[Dict[str, Any]] = [],
         language: str = "python",
         evolution_round: int = 0,
         allow_full_rewrite: bool = False,
         template_key: Optional[str] = None,
+        **kwargs: Any,
     ) -> Dict[str, str]:
         """
         Build a prompt for the LLM
@@ -67,6 +68,7 @@ def build_prompt(
             evolution_round: Current evolution round
             allow_full_rewrite: Whether to allow a full rewrite
             template_key: Optional override for template key
+            **kwargs: Additional keys to replace in the user prompt
 
         Returns:
             Dictionary with 'system' and 'user' keys
@@ -118,6 +120,7 @@ def build_prompt(
             evolution_history=evolution_history,
             current_program=current_program,
             language=language,
+            **kwargs,
         )
 
         return {

diff --git a/openevolve/prompt/templates.py b/openevolve/prompt/templates.py
index b50e34e9b..82d5a6b03 100644
--- a/openevolve/prompt/templates.py
+++ b/openevolve/prompt/templates.py
@@ -12,6 +12,9 @@
 Focus on making targeted changes that will increase the program's performance metrics.
 """
 
+BASE_EVALUATOR_SYSTEM_TEMPLATE = """You are an expert code reviewer.
+Your job is to analyze the provided code and evaluate it systematically."""
+
 # User message template for diff-based evolution
 DIFF_USER_TEMPLATE = """# Current Program Information
 - Current performance metrics: {metrics}
@@ -106,14 +109,38 @@
 Key features: {key_features}
 """
 
+# Template for evaluating a program via an LLM
+EVALUATION_TEMPLATE = """Evaluate the following code on a scale of 0.0 to 1.0 for the following metrics:
+1. Readability: How easy is the code to read and understand?
+2. Maintainability: How easy would the code be to maintain and modify?
+3. Efficiency: How efficient is the code in terms of time and space complexity?
+
+For each metric, provide a score between 0.0 and 1.0, where 1.0 is best.
+
+Code to evaluate:
+```python
+{current_program}
+```
+
+Return your evaluation as a JSON object with the following format:
+{{
+    "readability": [score],
+    "maintainability": [score],
+    "efficiency": [score],
+    "reasoning": "[brief explanation of scores]"
+}}
+"""
+
 # Default templates dictionary
 DEFAULT_TEMPLATES = {
     "system_message": BASE_SYSTEM_TEMPLATE,
+    "evaluator_system_message": BASE_EVALUATOR_SYSTEM_TEMPLATE,
     "diff_user": DIFF_USER_TEMPLATE,
     "full_rewrite_user": FULL_REWRITE_USER_TEMPLATE,
     "evolution_history": EVOLUTION_HISTORY_TEMPLATE,
     "previous_attempt": PREVIOUS_ATTEMPT_TEMPLATE,
     "top_program": TOP_PROGRAM_TEMPLATE,
+    "evaluation": EVALUATION_TEMPLATE,
 }

diff --git a/tests/test_valid_configs.py b/tests/test_valid_configs.py
new file mode 100644
index 000000000..c34a3a373
--- /dev/null
+++ b/tests/test_valid_configs.py
@@ -0,0 +1,34 @@
+"""
+Confirming the validity of configuration files in project directories
+"""
+
+import os
+import unittest
+from unittest.mock import MagicMock, patch
+
+from openevolve.config import Config, load_config
+
+class TestConfigValidity(unittest.TestCase):
+    """Tests that all config files in the configs/ and examples/ directories are valid"""
+
+    def collect_files(self):
+        """Collect all config/*config*.yaml and examples/**/*config*.yaml files"""
+        config_dir = os.path.join(os.path.dirname(__file__), "../configs")
+        example_dir = os.path.join(os.path.dirname(__file__), "../examples")
+        config_files = []
+        for root, _, files in os.walk(config_dir):
+            for file in files:
+                if "config" in file and file.endswith(".yaml"):
+                    config_files.append(os.path.join(root, file))
+        return config_files
+
+    def test_import_config_files(self):
+        """Attempt to import all config files"""
+        config_files = self.collect_files()
+        for config_file in config_files:
+            print(f"Testing config file: {config_file}")
+            config = load_config(config_file)
+            self.assertIsInstance(config, Config, f"Config file {config_file} did not load correctly")
+
+if __name__ == "__main__":
+    unittest.main()


From 16dce114ac34f199d37e51b1ec1ed7ca0ed9d280 Mon Sep 17 00:00:00 2001
From: Julian
Date: Tue, 3 Jun 2025 00:03:44 +0200
Subject: [PATCH 2/2] lint

---
 openevolve/config.py        | 4 +++-
 tests/test_valid_configs.py | 6 +++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/openevolve/config.py b/openevolve/config.py
index c742ef945..4252409cd 100644
--- a/openevolve/config.py
+++ b/openevolve/config.py
@@ -234,7 +234,9 @@ def from_dict(cls, config_dict: Dict[str, Any]) -> "Config":
             if "models" in llm_dict:
                 llm_dict["models"] = [LLMModelConfig(**m) for m in llm_dict["models"]]
             if "evaluator_models" in llm_dict:
-                llm_dict["evaluator_models"] = [LLMModelConfig(**m) for m in llm_dict["evaluator_models"]]
+                llm_dict["evaluator_models"] = [
+                    LLMModelConfig(**m) for m in llm_dict["evaluator_models"]
+                ]
             config.llm = LLMConfig(**llm_dict)
         if "prompt" in config_dict:
             config.prompt = PromptConfig(**config_dict["prompt"])

diff --git a/tests/test_valid_configs.py b/tests/test_valid_configs.py
index c34a3a373..829d23b42 100644
--- a/tests/test_valid_configs.py
+++ b/tests/test_valid_configs.py
@@ -8,6 +8,7 @@
 
 from openevolve.config import Config, load_config
 
+
 class TestConfigValidity(unittest.TestCase):
     """Tests that all config files in the configs/ and examples/ directories are valid"""
 
@@ -28,7 +29,10 @@ def test_import_config_files(self):
         for config_file in config_files:
             print(f"Testing config file: {config_file}")
             config = load_config(config_file)
-            self.assertIsInstance(config, Config, f"Config file {config_file} did not load correctly")
+            self.assertIsInstance(
+                config, Config, f"Config file {config_file} did not load correctly"
+            )
+
 
 if __name__ == "__main__":
     unittest.main()