diff --git a/examples/online_judge_programming_with_eval_obj/README.md b/examples/online_judge_programming_with_eval_obj/README.md
new file mode 100644
index 000000000..69c2a2856
--- /dev/null
+++ b/examples/online_judge_programming_with_eval_obj/README.md
@@ -0,0 +1,14 @@
+# Online Judge Programming With Evaluation Objects Example
+
+This example is a variant of the [online judge example](https://github.com/codelion/openevolve/tree/main/examples/online_judge_programming) demonstrating the use of evaluation objects. See its documentation for the problem description.
+
+## Running the example
+
+First, download your personal configuration file from [Kattis](https://open.kattis.com/download/kattisrc) (you must be logged in), copy the username and token from it into `example.kattisrc`, and rename that file to `.kattisrc`.
+
+Then, to run this example (replace `<problem_id>` with the short name of the Kattis problem you want to solve):
+
+```bash
+cd examples/online_judge_programming_with_eval_obj
+python main.py -p <problem_id>
+```
diff --git a/examples/online_judge_programming_with_eval_obj/config.yaml b/examples/online_judge_programming_with_eval_obj/config.yaml
new file mode 100644
index 000000000..7f640eb7c
--- /dev/null
+++ b/examples/online_judge_programming_with_eval_obj/config.yaml
@@ -0,0 +1,71 @@
+# Configuration for the online judge programming (evaluation objects) example
+max_iterations: 20
+checkpoint_interval: 1
+log_level: "INFO"
+
+# LLM configuration
+llm:
+  primary_model: "gemini-2.0-flash"
+  primary_model_weight: 0.6
+  secondary_model: "gemini-2.5-flash-preview-05-20"
+  secondary_model_weight: 0.4
+  api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
+  api_key: YOUR_API_KEY
+  temperature: 0.7
+  top_p: 0.95
+  max_tokens: 4096
+
+# Prompt configuration
+prompt:
+  system_message: |
+    You are an expert programmer. Your task is to implement an algorithm in Python to pass all the test cases. The problem is as follows:
+
+    A string of lowercase letters is called alphabetical if some of the letters can be deleted so that the only letters that remain are the letters from a to z in order. Given a string s, determine the minimum number of letters to add anywhere in the string to make it alphabetical.
+
+    Input:
+    Each input will consist of a single test case. Note that your program may be run multiple times on different inputs. The only line of input contains a string s (1 ≤ |s| ≤ 50) which contains only lowercase letters.
+    Output:
+    Output a single integer, which is the smallest number of letters needed to add to s to make it alphabetical.
+
+    Sample Input 1:
+    xyzabcdefghijklmnopqrstuvw
+    Sample Output 1:
+    3
+
+    Sample Input 2:
+    aiemckgobjfndlhp
+    Sample Output 2:
+    20
+
+    Your program should always read/write to STDIN/STDOUT. For example, to handle integer input, use the following format:
+    ```
+    import sys
+    for line in sys.stdin:
+        data = int(line)
+    ```
+    Use print() for output. For example:
+    ```
+    print("Hello, World!")
+    ```
+  num_top_programs: 3
+  use_template_stochasticity: true
+
+# Database configuration
+database:
+  population_size: 50
+  archive_size: 20
+  num_islands: 3
+  elite_selection_ratio: 0.2
+  exploitation_ratio: 0.7
+
+# Evaluator configuration
+evaluator:
+  timeout: 60
+  cascade_evaluation: false
+  cascade_thresholds: [1.0]
+  parallel_evaluations: 4
+  use_llm_feedback: false
+
+# Evolution settings
+diff_based_evolution: true
+allow_full_rewrites: false
diff --git a/examples/online_judge_programming_with_eval_obj/evaluator.py b/examples/online_judge_programming_with_eval_obj/evaluator.py
new file mode 100644
index 000000000..1e5954b23
--- /dev/null
+++ b/examples/online_judge_programming_with_eval_obj/evaluator.py
@@ -0,0 +1,106 @@
+"""
+Evaluator for the online judge programming example (evaluation object variant)
+"""
+
+import re
+import subprocess
+import time
+import traceback
+
+
+def run_with_timeout(program_path, problem_name, timeout_seconds=60):
+    """
+    Submit a program to the online judge via submit.py, with a timeout.
+
+    Args:
+        program_path: Program to submit
+        problem_name: Short name of the problem to submit to
+        timeout_seconds: Timeout in seconds
+
+    Returns:
+        Tuple of (score, done, correct, total); raises TimeoutError or RuntimeError on failure
+    """
+    cmd = ["python", "submit.py", program_path, "-p", problem_name, "-l", "Python 3", "-f"]
+
+    try:
+        # Run the command and grab its output using subprocess.Popen
+        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+        stdout, stderr = proc.communicate(timeout=timeout_seconds)
+        exit_code = proc.returncode
+        if exit_code != 0:
+            print(stderr)  # Print the error output if the command failed
+            raise RuntimeError(f"Process exited with code {exit_code}")
+    except subprocess.TimeoutExpired:
+        # Kill the process if it times out
+        proc.kill()
+        raise TimeoutError(f"Process timed out after {timeout_seconds} seconds")
+
+    pattern = (
+        r"Score:\s*(\d+)\s*"
+        r"Test cases done:\s*(\d+)\s*"
+        r"Test cases correct:\s*(\d+)\s*"
+        r"Test cases total:\s*(\d+)"
+    )
+    match = re.search(pattern, stdout)
+    if not match:
+        raise ValueError("Expected summary lines not found")
+
+    score, done, correct, total = map(int, match.groups())
+    return score, done, correct, total
+
+
+class EvaluationObject:
+    def __init__(self, problem_name: str, timeout_seconds: int):
+        self.problem_name = problem_name
+        self.timeout_seconds = timeout_seconds
+
+    def evaluate(self, program_path):
+        """
+        Evaluate the program by submitting it to the online judge and fetching metrics based on how well it performs.
+
+        Args:
+            program_path: Path to the program file
+
+        Returns:
+            Dictionary of metrics
+        """
+        try:
+            # A single submission per evaluation is sufficient,
+            # since the judge's verdict for a given program is deterministic
+            start_time = time.time()
+
+            # Use subprocess to run with timeout
+            score, done, correct, total = run_with_timeout(
+                program_path, self.problem_name, self.timeout_seconds
+            )
+
+            end_time = time.time()
+            eval_time = end_time - start_time
+
+            # Combined score - higher is better
+            combined_score = correct / total if total > 0 else 0.0
+
+            print(
+                f"Evaluation: Score={score}, Done={done}, Correct={correct}, Total={total}, Combined={combined_score:.2f}"
+            )
+
+            return {
+                "score": score,
+                "done": done,
+                "correct": correct,
+                "total": total,
+                "eval_time": eval_time,
+                "combined_score": float(combined_score),
+            }
+
+        except Exception as e:
+            print(f"Evaluation failed completely: {str(e)}")
+            traceback.print_exc()
+            return {
+                "score": 0,
+                "done": 0,
+                "correct": 0,
+                "total": 0,
+                "eval_time": 0.0,
+                "combined_score": 0.0,
+            }
diff --git a/examples/online_judge_programming_with_eval_obj/example.kattisrc b/examples/online_judge_programming_with_eval_obj/example.kattisrc
new file mode 100644
index 000000000..249859bde
--- /dev/null
+++ b/examples/online_judge_programming_with_eval_obj/example.kattisrc
@@ -0,0 +1,14 @@
+# Please save this file as .kattisrc in your home directory.
+# This file includes a secret token that allows you to log in.
+# DO NOT SHARE IT WITH ANYONE ELSE.
+# If someone gets access to this token, please revoke it by changing your KATTIS password.
+
+[user]
+username: YOUR_USERNAME
+token: YOUR_TOKEN
+
+[kattis]
+hostname: open.kattis.com
+loginurl: https://open.kattis.com/login
+submissionurl: https://open.kattis.com/submit
+submissionsurl: https://open.kattis.com/submissions
diff --git a/examples/online_judge_programming_with_eval_obj/initial_program.py b/examples/online_judge_programming_with_eval_obj/initial_program.py
new file mode 100644
index 000000000..936b6845e
--- /dev/null
+++ b/examples/online_judge_programming_with_eval_obj/initial_program.py
@@ -0,0 +1,12 @@
+"""Online judge programming example for OpenEvolve"""
+
+# EVOLVE-BLOCK-START
+import sys
+
+for line in sys.stdin:
+    s = line.strip()
+
+ans = 0
+print(ans)
+
+# EVOLVE-BLOCK-END
diff --git a/examples/online_judge_programming_with_eval_obj/main.py b/examples/online_judge_programming_with_eval_obj/main.py
new file mode 100644
index 000000000..b86f44b25
--- /dev/null
+++ b/examples/online_judge_programming_with_eval_obj/main.py
@@ -0,0 +1,25 @@
+import asyncio
+from argparse import ArgumentParser
+
+from openevolve import OpenEvolve
+from evaluator import EvaluationObject
+
+if __name__ == "__main__":
+    parser = ArgumentParser()
+    parser.add_argument(
+        "-p",
+        "--problem",
+        help="Which problem to solve",
+    )
+    parser.add_argument(
+        "-t",
+        "--timeout",
+        help="Timeout for a single submission (in seconds)",
+        type=int,
+        default=60,
+    )
+
+    args = parser.parse_args()
+    eval_obj = EvaluationObject(args.problem, args.timeout)
+    evolve = OpenEvolve("initial_program.py", "", eval_obj, "config.yaml")
+    asyncio.run(evolve.run())
\ No newline at end of file
diff --git a/examples/online_judge_programming_with_eval_obj/requirements.txt b/examples/online_judge_programming_with_eval_obj/requirements.txt
new file mode 100644
index 000000000..7cdc7eaad
--- /dev/null
+++ b/examples/online_judge_programming_with_eval_obj/requirements.txt
@@ -0,0 +1,2 @@
+lxml
+requests
\ No newline at end of file
diff --git 
a/examples/online_judge_programming_with_eval_obj/submit.py b/examples/online_judge_programming_with_eval_obj/submit.py new file mode 100755 index 000000000..c2799824a --- /dev/null +++ b/examples/online_judge_programming_with_eval_obj/submit.py @@ -0,0 +1,603 @@ +#!/usr/bin/env python +import argparse +import configparser +import os +import re +import requests +import sys +import time + +from lxml.html import fragment_fromstring + +_DEFAULT_CONFIG = "/usr/local/etc/kattisrc" +_LANGUAGE_GUESS = { + ".4th": "Forth", + ".c": "C", + ".c++": "C++", + ".cc": "C++", + ".c#": "C#", + ".cpp": "C++", + ".cs": "C#", + ".cxx": "C++", + ".cbl": "COBOL", + ".cob": "COBOL", + ".cpy": "COBOL", + ".forth": "Forth", + ".frt": "Forth", + ".fs": "F#", + ".fth": "Forth", + ".go": "Go", + ".hs": "Haskell", + ".java": "Java", + ".js": "JavaScript (Node.js)", + ".ts": "TypeScript", + ".kt": "Kotlin", + ".lisp": "Common Lisp", + ".cl": "Common Lisp", + ".m": "Objective-C", + ".ml": "OCaml", + ".pas": "Pascal", + ".php": "PHP", + ".pl": "Prolog", + ".py": "Python 3", + ".pyc": "Python 3", + ".rb": "Ruby", + ".rkt": "Racket", + ".rs": "Rust", + ".scala": "Scala", + ".f90": "Fortran", + ".f": "Fortran", + ".for": "Fortran", + ".sh": "Bash", + ".apl": "APL", + ".ss": "Gerbil", + ".jl": "Julia", + ".vb": "Visual Basic", + ".dart": "Dart", + ".zig": "Zig", + ".swift": "Swift", + ".nim": "Nim", + ".lua": "Lua", + ".pm": "Perl", + ".sno": "SNOBOL", + ".odin": "Odin", + ".a68": "Algol 68", + ".cr": "Crystal", + ".sim": "Simula 67", + ".d": "D", + ".mod": "Modula-2", + ".st": "Smalltalk", + ".adb": "Ada", + ".ads": "Ada", + ".erl": "Erlang", + ".ex": "Elixir", +} + +_GUESS_MAINCLASS = {"Elixir", "Erlang", "Java", "Kotlin", "Modula-2", "Scala"} +_GUESS_MAINFILE = { + "Ada", + "Algol 68", + "APL", + "Bash", + "Crystal", + "Dart", + "Forth", + "Gerbil", + "JavaScript (Node.js)", + "JavaScript (SpiderMonkey)", + "Julia", + "Common Lisp", + "Lua", + "Nim", + "Octave", + "Pascal", + "Perl", + "PHP", + "Python 2", + "Python 3", + "Racket", + "Ruby", + "Rust", + "Simula", + "Smalltalk", + "SNOBOL", + "TypeScript", + "Zig", +} + +_HEADERS = {"User-Agent": "kattis-cli-submit"} + +_RUNNING_STATUS = 5 +_COMPILE_ERROR_STATUS = 8 +_ACCEPTED_STATUS = 16 +_STATUS_MAP = { + 0: "New", # + 1: "New", + 2: "Waiting for compile", + 3: "Compiling", + 4: "Waiting for run", + _RUNNING_STATUS: "Running", + 6: "Judge Error", + # 7: '', + _COMPILE_ERROR_STATUS: "Compile Error", + 9: "Run Time Error", + 10: "Memory Limit Exceeded", + 11: "Output Limit Exceeded", + 12: "Time Limit Exceeded", + 13: "Illegal Function", + 14: "Wrong Answer", + # 15: '', + _ACCEPTED_STATUS: "Accepted", +} + + +class ConfigError(Exception): + pass + + +def get_url(cfg, option, default): + if cfg.has_option("kattis", option): + return cfg.get("kattis", option) + else: + hostname = cfg.get("kattis", "hostname") + return f"https://{hostname}/{default}" + + +def get_config(): + """Returns a ConfigParser object for the .kattisrc file(s)""" + cfg = configparser.ConfigParser() + if os.path.exists(_DEFAULT_CONFIG): + cfg.read(_DEFAULT_CONFIG) + + try: + file = __file__ + except NameError: + file = sys.argv[0] + + if not cfg.read( + [ + os.path.join(os.path.expanduser("~"), ".kattisrc"), + os.path.join(os.path.dirname(file), ".kattisrc"), + os.path.join(os.path.dirname(os.path.realpath(file)), ".kattisrc"), + ] + ): + raise ConfigError( + """\ +I failed to read in a config file from your home directory or from the +same directory as this script. 
To download a .kattisrc file please visit +https:///download/kattisrc + +The file should look something like this: +[user] +username: yourusername +token: ********* + +[kattis] +hostname: +loginurl: https:///login +submissionurl: https:///submit +submissionsurl: https:///submissions""" + ) + return cfg + + +def is_python2(files): + python2 = re.compile(r"^\s*\bprint\b *[^ \(\),\]]|\braw_input\b") + for filename in files: + try: + with open(filename) as f: + for index, line in enumerate(f): + if index == 0 and line.startswith("#!"): + if "python2" in line: + return True + if "python3" in line: + return False + if python2.search(line.split("#")[0]): + return True + except UnicodeDecodeError: + pass + except IOError: + return False + return False + + +def guess_language(ext, files): + if ext == ".C": + return "C++" + ext = ext.lower() + if ext == ".h": + if any(f.endswith(".c") for f in files): + return "C" + else: + return "C++" + if ext == ".py": + if is_python2(files): + return "Python 2" + else: + return "Python 3" + return _LANGUAGE_GUESS.get(ext, None) + + +def guess_mainfile(language, files): + for filename in files: + if os.path.splitext(os.path.basename(filename))[0] in ["main", "Main"]: + return filename + for filename in files: + try: + with open(filename) as f: + conts = f.read() + if language in ["Java", "Rust", "Scala", "Kotlin"] and re.search( + r" main\s*\(", conts + ): + return filename + if language == "Pascal" and re.match(r"^\s*[Pp]rogram\b", conts): + return filename + except UnicodeDecodeError: + pass + except IOError: + pass + return files[0] + + +def guess_mainclass(language, files): + if language in _GUESS_MAINFILE and len(files) > 1: + return os.path.basename(guess_mainfile(language, files)) + if language in _GUESS_MAINCLASS: + mainfile = os.path.basename(guess_mainfile(language, files)) + name = os.path.splitext(mainfile)[0] + if language == "Kotlin": + return name[0].upper() + name[1:] + "Kt" + return name + return None + + +def login(login_url, username, password=None, token=None): + """Log in to Kattis. + + At least one of password or token needs to be provided. + + Returns a requests.Response with cookies needed to be able to submit + """ + login_args = {"user": username, "script": "true"} + if password: + login_args["password"] = password + if token: + login_args["token"] = token + + return requests.post(login_url, data=login_args, headers=_HEADERS) + + +def login_from_config(cfg): + """Log in to Kattis using the access information in a kattisrc file + + Returns a requests.Response with cookies needed to be able to submit + """ + username = cfg.get("user", "username") + password = token = None + try: + password = cfg.get("user", "password") + except configparser.NoOptionError: + pass + try: + token = cfg.get("user", "token") + except configparser.NoOptionError: + pass + if password is None and token is None: + raise ConfigError( + """\ +Your .kattisrc file appears corrupted. It must provide a token (or a +KATTIS password). + +Please download a new .kattisrc file""" + ) + + loginurl = get_url(cfg, "loginurl", "login") + return login(loginurl, username, password, token) + + +def submit( + submit_url, + cookies, + problem, + language, + files, + mainclass="", + tag="", + assignment=None, + contest=None, +): + """Make a submission. 
+ + The url_opener argument is an OpenerDirector object to use (as + returned by the login() function) + + Returns the requests.Result from the submission + """ + + data = { + "submit": "true", + "submit_ctr": 2, + "language": language, + "mainclass": mainclass, + "problem": problem, + "tag": tag, + "script": "true", + } + + if assignment is not None: + data["assignment"] = assignment + if contest is not None: + data["contest"] = contest + sub_files = [] + for f in files: + with open(f, "rb") as sub_file: + sub_files.append( + ("sub_file[]", (os.path.basename(f), sub_file.read(), "application/octet-stream")) + ) + + return requests.post(submit_url, data=data, files=sub_files, cookies=cookies, headers=_HEADERS) + + +def confirm_or_die(problem, language, files, mainclass, tag): + print("Problem:", problem) + print("Language:", language) + print("Files:", ", ".join(files)) + if mainclass: + if language in _GUESS_MAINFILE: + print("Main file:", mainclass) + else: + print("Mainclass:", mainclass) + if tag: + print("Tag:", tag) + print("Submit (y/N)?") + if sys.stdin.readline().upper()[:-1] != "Y": + print("Cancelling") + sys.exit(1) + + +def get_submission_url(submit_response, cfg): + m = re.search(r"Submission ID: (\d+)", submit_response) + if m: + submissions_url = get_url(cfg, "submissionsurl", "submissions") + submission_id = m.group(1) + return f"{submissions_url}/{submission_id}" + + +def get_submission_status(submission_url, cookies): + reply = requests.get(submission_url + "?json", cookies=cookies, headers=_HEADERS) + return reply.json() + + +_RED_COLOR = 31 +_GREEN_COLOR = 32 +_YELLOW_COLOR = 33 + + +def color(s, c): + return f"\x1b[{c}m{s}\x1b[0m" + + +def show_judgement(submission_url, cfg): + login_reply = login_from_config(cfg) + while True: + status = get_submission_status(submission_url, login_reply.cookies) + status_id = status["status_id"] + testcases_done = status["testcase_index"] + testcases_total = status["row_html"].count(" _RUNNING_STATUS: + # Done + print() + success = status_id == _ACCEPTED_STATUS + try: + root = fragment_fromstring(status["row_html"], create_parent=True) + cpu_time = root.xpath('.//*[@data-type="cpu"]')[0].text_content() + try: + score = re.findall( + r"\(([\d\.]+)\)", root.xpath('.//*[@data-type="status"]')[0].text_content() + )[0] + except: + score = "" + status_text += ( + " (" + cpu_time + ", " + score + ")" if score else " (" + cpu_time + ")" + ) + except: + pass + if status_id != _COMPILE_ERROR_STATUS: + print(color(status_text, _GREEN_COLOR if success else _RED_COLOR)) + numerical_score = int(score) if score else 0 + return success, numerical_score, testcases_done, testcases_correct, testcases_total + + time.sleep(0.25) + + +def main(): + parser = argparse.ArgumentParser(prog="kattis", description="Submit a solution to Kattis") + group = parser.add_mutually_exclusive_group() + group.add_argument( + "-a", + "--assignment", + help="""Short name of assignment you want to submit to +Overrides default guess (server guesses based on assignments you are in)""", + ) + group.add_argument( + "-c", + "--contest", + help="""Short name of contest you want to submit to +Overrides default guess (server guesses based on contests you are in)""", + ) + parser.add_argument( + "-p", + "--problem", + help="""'Which problem to submit to. +Overrides default guess (first part of first filename)""", + ) + parser.add_argument( + "-m", + "--mainclass", + help="""Sets mainclass. 
+Overrides default guess (first part of first filename)""", + ) + parser.add_argument( + "-l", + "--language", + help="""Sets language. +Overrides default guess (based on suffix of first filename)""", + ) + parser.add_argument("-t", "--tag", help=argparse.SUPPRESS) + parser.add_argument( + "-f", "--force", help="Force, no confirmation prompt before submission", action="store_true" + ) + parser.add_argument("files", nargs="+") + + args = parser.parse_args() + files = args.files + + try: + cfg = get_config() + except ConfigError as exc: + print(exc) + sys.exit(1) + + problem, ext = os.path.splitext(os.path.basename(files[0])) + language = guess_language(ext, files) + mainclass = guess_mainclass(language, files) + tag = args.tag + + problem = problem.lower() + + if args.problem: + problem = args.problem + + if args.mainclass is not None: + mainclass = args.mainclass + + if args.language: + language = args.language + + if language is None: + print( + f'''\ +No language specified, and I failed to guess language from filename +extension "{ext}"''' + ) + sys.exit(1) + + files = sorted(list(set(args.files))) + + try: + login_reply = login_from_config(cfg) + except ConfigError as exc: + print(exc) + sys.exit(1) + except requests.exceptions.RequestException as err: + print("Login connection failed:", err) + sys.exit(1) + + if not login_reply.status_code == 200: + print("Login failed.") + if login_reply.status_code == 403: + print("Incorrect username or password/token (403)") + elif login_reply.status_code == 404: + print("Incorrect login URL (404)") + else: + print("Status code:", login_reply.status_code) + sys.exit(1) + + submit_url = get_url(cfg, "submissionurl", "submit") + + if not args.force: + confirm_or_die(problem, language, files, mainclass, tag) + + try: + result = submit( + submit_url, + login_reply.cookies, + problem, + language, + files, + mainclass, + tag, + args.assignment, + args.contest, + ) + except requests.exceptions.RequestException as err: + print("Submit connection failed:", err) + sys.exit(1) + + if result.status_code != 200: + print("Submission failed.") + if result.status_code == 403: + print("Access denied (403)") + elif result.status_code == 404: + print("Incorrect submit URL (404)") + else: + print("Status code:", result.status_code) + sys.exit(1) + + plain_result = result.content.decode("utf-8").replace("
", "\n") + print(plain_result) + + submission_url = None + try: + submission_url = get_submission_url(plain_result, cfg) + except configparser.NoOptionError: + pass + + if submission_url: + _, score, testcases_done, testcases_correct, testcases_total = show_judgement( + submission_url, cfg + ) + print(f"Score: {score}") + print(f"Test cases done: {testcases_done}") + print(f"Test cases correct: {testcases_correct}") + print(f"Test cases total: {testcases_total}") + + +if __name__ == "__main__": + main() diff --git a/openevolve/controller.py b/openevolve/controller.py index 8100277ed..3562a2b21 100644 --- a/openevolve/controller.py +++ b/openevolve/controller.py @@ -13,7 +13,7 @@ from openevolve.config import Config, load_config from openevolve.database import Program, ProgramDatabase -from openevolve.evaluator import Evaluator +from openevolve.evaluator import Evaluator, EvaluationObject from openevolve.llm.ensemble import LLMEnsemble from openevolve.prompt.sampler import PromptSampler from openevolve.process_parallel import ProcessParallelController @@ -74,6 +74,7 @@ def __init__( self, initial_program_path: str, evaluation_file: str, + evaluation_object: Optional[EvaluationObject] = None, config_path: Optional[str] = None, config: Optional[Config] = None, output_dir: Optional[str] = None, @@ -154,12 +155,14 @@ def __init__( self.evaluator = Evaluator( self.config.evaluator, evaluation_file, + evaluation_object, self.llm_evaluator_ensemble, self.evaluator_prompt_sampler, database=self.database, suffix=Path(self.initial_program_path).suffix, ) self.evaluation_file = evaluation_file + self.evaluation_object = self.evaluator.evaluation_object logger.info(f"Initialized OpenEvolve with {initial_program_path}") @@ -276,7 +279,7 @@ async def run( # Initialize improved parallel processing try: self.parallel_controller = ProcessParallelController( - self.config, self.evaluation_file, self.database + self.config, self.evaluation_file, self.evaluation_object, self.database ) # Set up signal handlers for graceful shutdown diff --git a/openevolve/evaluator.py b/openevolve/evaluator.py index 0e25470b0..504776412 100644 --- a/openevolve/evaluator.py +++ b/openevolve/evaluator.py @@ -13,8 +13,9 @@ import time import traceback import uuid +import warnings from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union, Protocol, cast import traceback from openevolve.config import EvaluatorConfig @@ -29,6 +30,22 @@ logger = logging.getLogger(__name__) +class EvaluationObject(Protocol): + def evaluate(self, program_path: str) -> EvaluationResult: + ... + + +class CascadeEvaluationObject(Protocol): + def evaluate_stage1(self, program_path: str) -> EvaluationResult: + ... + + def evaluate_stage2(self, program_path: str) -> EvaluationResult: + ... + + def evaluate_stage3(self, program_path: str) -> EvaluationResult: + ... 
+ + class Evaluator: """ Evaluates programs and assigns scores @@ -41,14 +58,18 @@ def __init__( self, config: EvaluatorConfig, evaluation_file: str, + evaluation_object: Optional[EvaluationObject] = None, llm_ensemble: Optional[LLMEnsemble] = None, prompt_sampler: Optional[PromptSampler] = None, database: Optional[ProgramDatabase] = None, suffix: Optional[str]=".py", ): + if evaluation_file and evaluation_object: + warnings.warn("Both evaluation_file and evaluation_object provided - evaluation_object overrides evaluation_file") self.config = config self.evaluation_file = evaluation_file self.program_suffix = suffix + self.evaluation_object = evaluation_object self.llm_ensemble = llm_ensemble self.prompt_sampler = prompt_sampler self.database = database @@ -56,8 +77,9 @@ def __init__( # Create a task pool for parallel evaluation self.task_pool = TaskPool(max_concurrency=config.parallel_evaluations) - # Set up evaluation function if file exists - self._load_evaluation_function() + if self.evaluation_object is None: + # Set up evaluation module if file exists + self._load_evaluation_function() # Pending artifacts storage for programs self._pending_artifacts: Dict[str, Dict[str, Union[str, bytes]]] = {} @@ -89,7 +111,7 @@ def _load_evaluation_function(self) -> None: f"Evaluation file {self.evaluation_file} does not contain an 'evaluate' function" ) - self.evaluate_function = module.evaluate + self.evaluation_object = module logger.info(f"Successfully loaded evaluation function from {self.evaluation_file}") # Validate cascade configuration @@ -348,7 +370,7 @@ async def _direct_evaluate( # Create a coroutine that runs the evaluation function in an executor async def run_evaluation(): loop = asyncio.get_event_loop() - return await loop.run_in_executor(None, self.evaluate_function, program_path) + return await loop.run_in_executor(None, self.evaluation_object.evaluate, program_path) # Run the evaluation with timeout - let exceptions bubble up for retry handling result = await asyncio.wait_for(run_evaluation(), timeout=self.config.timeout) @@ -369,23 +391,11 @@ async def _cascade_evaluate( Returns: Dictionary of metrics or EvaluationResult with metrics and artifacts """ - # Import the evaluation module to get cascade functions if they exist + # This cast just makes static type checkers happy; actual checking is still done using hasattr + evaluation_object = cast(CascadeEvaluationObject, self.evaluation_object) try: - # Add the evaluation file's directory to Python path so it can import local modules - eval_dir = os.path.dirname(os.path.abspath(self.evaluation_file)) - if eval_dir not in sys.path: - sys.path.insert(0, eval_dir) - logger.debug(f"Added {eval_dir} to Python path for cascade evaluation") - - spec = importlib.util.spec_from_file_location("evaluation_module", self.evaluation_file) - if spec is None or spec.loader is None: - return await self._direct_evaluate(program_path) - - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - # Check if cascade functions exist - if not hasattr(module, "evaluate_stage1"): + if not hasattr(evaluation_object, "evaluate_stage1"): return await self._direct_evaluate(program_path) # Run first stage with timeout @@ -393,7 +403,7 @@ async def _cascade_evaluate( async def run_stage1(): loop = asyncio.get_event_loop() - return await loop.run_in_executor(None, module.evaluate_stage1, program_path) + return await loop.run_in_executor(None, evaluation_object.evaluate_stage1, program_path) stage1_result = await 
asyncio.wait_for(run_stage1(), timeout=self.config.timeout) stage1_eval_result = self._process_evaluation_result(stage1_result) @@ -426,7 +436,7 @@ async def run_stage1(): return stage1_eval_result # Check if second stage exists - if not hasattr(module, "evaluate_stage2"): + if not hasattr(evaluation_object, "evaluate_stage2"): return stage1_eval_result # Run second stage with timeout @@ -434,7 +444,7 @@ async def run_stage1(): async def run_stage2(): loop = asyncio.get_event_loop() - return await loop.run_in_executor(None, module.evaluate_stage2, program_path) + return await loop.run_in_executor(None, evaluation_object.evaluate_stage2, program_path) stage2_result = await asyncio.wait_for(run_stage2(), timeout=self.config.timeout) stage2_eval_result = self._process_evaluation_result(stage2_result) @@ -488,7 +498,7 @@ async def run_stage2(): return merged_result # Check if third stage exists - if not hasattr(module, "evaluate_stage3"): + if not hasattr(evaluation_object, "evaluate_stage3"): return merged_result # Run third stage with timeout @@ -496,7 +506,7 @@ async def run_stage2(): async def run_stage3(): loop = asyncio.get_event_loop() - return await loop.run_in_executor(None, module.evaluate_stage3, program_path) + return await loop.run_in_executor(None, evaluation_object.evaluate_stage3, program_path) stage3_result = await asyncio.wait_for(run_stage3(), timeout=self.config.timeout) stage3_eval_result = self._process_evaluation_result(stage3_result) diff --git a/openevolve/process_parallel.py b/openevolve/process_parallel.py index d2df82f6c..84fb4285e 100644 --- a/openevolve/process_parallel.py +++ b/openevolve/process_parallel.py @@ -15,6 +15,7 @@ from openevolve.config import Config from openevolve.database import Program, ProgramDatabase +from openevolve.evaluator import EvaluationObject logger = logging.getLogger(__name__) @@ -33,10 +34,11 @@ class SerializableResult: error: Optional[str] = None -def _worker_init(config_dict: dict, evaluation_file: str) -> None: +def _worker_init(config_dict: dict, evaluation_file: str, evaluation_object: EvaluationObject) -> None: """Initialize worker process with necessary components""" global _worker_config global _worker_evaluation_file + global _worker_evaluation_object global _worker_evaluator global _worker_llm_ensemble global _worker_prompt_sampler @@ -79,6 +81,7 @@ def _worker_init(config_dict: dict, evaluation_file: str) -> None: }, ) _worker_evaluation_file = evaluation_file + _worker_evaluation_object = evaluation_object # These will be lazily initialized on first use _worker_evaluator = None @@ -115,6 +118,7 @@ def _lazy_init_worker_components(): _worker_evaluator = Evaluator( _worker_config.evaluator, _worker_evaluation_file, + _worker_evaluation_object, evaluator_llm, evaluator_prompt, database=None, # No shared database in worker @@ -258,9 +262,10 @@ def _run_iteration_worker( class ProcessParallelController: """Controller for process-based parallel evolution""" - def __init__(self, config: Config, evaluation_file: str, database: ProgramDatabase): + def __init__(self, config: Config, evaluation_file: str, evaluation_object: Optional[EvaluationObject], database: ProgramDatabase): self.config = config self.evaluation_file = evaluation_file + self.evaluation_object = evaluation_object self.database = database self.executor: Optional[ProcessPoolExecutor] = None @@ -310,7 +315,7 @@ def start(self) -> None: self.executor = ProcessPoolExecutor( max_workers=self.num_workers, initializer=_worker_init, - initargs=(config_dict, 
self.evaluation_file), + initargs=(config_dict, self.evaluation_file, self.evaluation_object), ) logger.info(f"Started process pool with {self.num_workers} processes") @@ -514,16 +519,11 @@ async def run_evolution( # Check target score if target_score is not None and child_program.metrics: - numeric_metrics = [ - v for v in child_program.metrics.values() if isinstance(v, (int, float)) - ] - if numeric_metrics: - avg_score = sum(numeric_metrics) / len(numeric_metrics) - if avg_score >= target_score: - logger.info( - f"Target score {target_score} reached at iteration {completed_iteration}" - ) - break + if child_program.metrics["combined_score"] >= target_score: + logger.info( + f"Target score {target_score} reached at iteration {completed_iteration}" + ) + break except Exception as e: logger.error(f"Error processing result from iteration {completed_iteration}: {e}") diff --git a/tests/test_checkpoint_resume.py b/tests/test_checkpoint_resume.py index 0320ae289..dcc5fce7a 100644 --- a/tests/test_checkpoint_resume.py +++ b/tests/test_checkpoint_resume.py @@ -21,6 +21,8 @@ class MockEvaluator: """Mock evaluator for testing""" + evaluation_object = None + def __init__(self): self.call_count = 0 diff --git a/tests/test_process_parallel.py b/tests/test_process_parallel.py index f7965a2e4..821eecf44 100644 --- a/tests/test_process_parallel.py +++ b/tests/test_process_parallel.py @@ -64,7 +64,7 @@ def tearDown(self): def test_controller_initialization(self): """Test that controller initializes correctly""" - controller = ProcessParallelController(self.config, self.eval_file, self.database) + controller = ProcessParallelController(self.config, self.eval_file, None, self.database) self.assertEqual(controller.num_workers, 2) self.assertIsNone(controller.executor) @@ -72,7 +72,7 @@ def test_controller_initialization(self): def test_controller_start_stop(self): """Test starting and stopping the controller""" - controller = ProcessParallelController(self.config, self.eval_file, self.database) + controller = ProcessParallelController(self.config, self.eval_file, None, self.database) # Start controller controller.start() @@ -85,7 +85,7 @@ def test_controller_start_stop(self): def test_database_snapshot_creation(self): """Test creating database snapshot for workers""" - controller = ProcessParallelController(self.config, self.eval_file, self.database) + controller = ProcessParallelController(self.config, self.eval_file, None, self.database) snapshot = controller._create_database_snapshot() @@ -106,7 +106,7 @@ def test_run_evolution_basic(self): """Test basic evolution run""" async def run_test(): - controller = ProcessParallelController(self.config, self.eval_file, self.database) + controller = ProcessParallelController(self.config, self.eval_file, None, self.database) # Mock the executor to avoid actually spawning processes with patch.object(controller, "_submit_iteration") as mock_submit: @@ -152,7 +152,7 @@ async def run_test(): def test_request_shutdown(self): """Test graceful shutdown request""" - controller = ProcessParallelController(self.config, self.eval_file, self.database) + controller = ProcessParallelController(self.config, self.eval_file, None, self.database) # Request shutdown controller.request_shutdown()
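Note on usage (not part of the diff): the new `evaluation_object` parameter accepts any object whose `evaluate(program_path)` method returns a metrics dictionary (or `EvaluationResult`), so evaluation logic can carry state such as CLI arguments or credentials instead of living in a module-level `evaluate()` function. The sketch below illustrates the protocol; the class name, the test data, and the argument-free `evolve.run()` call are illustrative assumptions rather than APIs confirmed by this PR.

```python
# Sketch of a stateful evaluation object satisfying the EvaluationObject protocol.
# Hypothetical example: score a candidate program against one fixed input/output pair.
import asyncio
import subprocess

from openevolve import OpenEvolve


class SubprocessEvaluationObject:
    """Runs the candidate program on a fixed input and checks its output."""

    def __init__(self, test_input: str, expected_output: str, timeout_seconds: int = 10):
        self.test_input = test_input
        self.expected_output = expected_output
        self.timeout_seconds = timeout_seconds

    def evaluate(self, program_path: str) -> dict:
        try:
            proc = subprocess.run(
                ["python", program_path],
                input=self.test_input,
                capture_output=True,
                text=True,
                timeout=self.timeout_seconds,
            )
            correct = float(proc.stdout.strip() == self.expected_output)
        except Exception:
            correct = 0.0
        # "combined_score" is the key the revised target-score check reads.
        return {"correct": correct, "combined_score": correct}


if __name__ == "__main__":
    eval_obj = SubprocessEvaluationObject("aiemckgobjfndlhp\n", "20")
    # Positional arguments mirror main.py in this example:
    # (initial_program_path, evaluation_file, evaluation_object, config_path).
    # Passing "" as evaluation_file avoids the "both provided" warning in Evaluator.__init__.
    evolve = OpenEvolve("initial_program.py", "", eval_obj, "config.yaml")
    asyncio.run(evolve.run())  # assumes run() can be called with default arguments
```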
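Cascade evaluation follows the same pattern: the refactored `_cascade_evaluate` probes the evaluation object with `hasattr`, so `evaluate_stage1` is the only required stage method and `evaluate_stage2`/`evaluate_stage3` are optional, with stage progression gated by the evaluator config (`cascade_evaluation`, `cascade_thresholds`), which this example leaves disabled. Below is a hedged sketch of an object satisfying the `CascadeEvaluationObject` protocol; the class and its staging logic are made up for illustration.

```python
# Hypothetical two-stage evaluation object: a cheap syntax gate, then full scoring.
class CascadeEvaluationExample:
    def evaluate_stage1(self, program_path: str) -> dict:
        # Stage 1: reject candidates that do not even compile.
        try:
            with open(program_path) as f:
                compile(f.read(), program_path, "exec")
            return {"compiles": 1.0, "combined_score": 0.1}
        except SyntaxError:
            return {"compiles": 0.0, "combined_score": 0.0}

    def evaluate_stage2(self, program_path: str) -> dict:
        # Stage 2: expensive scoring (e.g. the Kattis submission above) would go here.
        return {"combined_score": 1.0}

    # A plain evaluate() keeps the object usable when cascade_evaluation is false.
    def evaluate(self, program_path: str) -> dict:
        return self.evaluate_stage2(program_path)
```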