
Commit 4eeb558

resolved merge conflicts

2 parents 213e0f7 + 4b099e3

31 files changed: +1030 -225 lines

.gitignore

Lines changed: 5 additions & 1 deletion

```diff
@@ -1,3 +1,7 @@
+results/
+examples/lm_eval/prompts/system_message.txt
+examples/lm_eval/prompts/evaluator_system_message.txt
+
 # Python
 __pycache__/
 *.py[cod]
@@ -48,4 +52,4 @@ htmlcov/
 
 # For SR
 secrets.yaml
-problems
+problems
```

Makefile

Lines changed: 6 additions & 1 deletion

```diff
@@ -48,4 +48,9 @@ docker-build:
 # Run the Docker container with the example
 .PHONY: docker-run
 docker-run:
-	docker run --rm -v $(PROJECT_DIR):/app $(DOCKER_IMAGE) examples/function_minimization/initial_program.py examples/function_minimization/evaluator.py --config examples/function_minimization/config.yaml --iterations 1000
+	docker run --rm -v $(PROJECT_DIR):/app --network="host" $(DOCKER_IMAGE) examples/function_minimization/initial_program.py examples/function_minimization/evaluator.py --config examples/function_minimization/config.yaml --iterations 1000
+
+# Run the lm-eval benchmark
+.PHONY: lm-eval
+lm-eval:
+	$(PYTHON) scripts/lm_eval/lm-eval.py
```

README.md

Lines changed: 1 addition & 1 deletion

````diff
@@ -133,7 +133,7 @@ cat checkpoints/checkpoint_*/best_program_info.json | grep -A 10 metrics
 You can also install and execute via Docker:
 ```bash
 docker build -t openevolve .
-docker run --rm -v $(pwd):/app openevolve examples/function_minimization/initial_program.py examples/function_minimization/evaluator.py --config examples/function_minimization/config.yaml --iterations 1000
+docker run --rm -v $(pwd):/app --network="host" openevolve examples/function_minimization/initial_program.py examples/function_minimization/evaluator.py --config examples/function_minimization/config.yaml --iterations 1000
 ```
 
 ## Configuration
````

configs/default_config.yaml

Lines changed: 16 additions & 7 deletions

```diff
@@ -16,13 +16,21 @@ max_code_length: 10000  # Maximum allowed code length in character
 
 # LLM configuration
 llm:
-  # Primary model (used most frequently)
-  primary_model: "gemini-2.0-flash-lite"
-  primary_model_weight: 0.8  # Sampling weight for primary model
-
-  # Secondary model (used for occasional high-quality generations)
-  secondary_model: "gemini-2.0-flash"
-  secondary_model_weight: 0.2  # Sampling weight for secondary model
+  # Models for evolution
+  models:
+    # List of available models with their weights
+    - name: "gemini-2.0-flash-lite"
+      weight: 0.8
+    - name: "gemini-2.0-flash"
+      weight: 0.2
+
+  # Models for LLM feedback
+  evaluator_models:
+    # List of available models with their weights
+    - name: "gemini-2.0-flash-lite"
+      weight: 0.8
+    - name: "gemini-2.0-flash"
+      weight: 0.2
 
   # API configuration
   api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"  # Base URL for API (change for non-OpenAI models)
@@ -42,6 +50,7 @@ llm:
 prompt:
   template_dir: null  # Custom directory for prompt templates
   system_message: "You are an expert coder helping to improve programs through evolution."
+  evaluator_system_message: "You are an expert code reviewer."
 
   # Number of examples to include in the prompt
   num_top_programs: 3  # Number of top-performing programs to include
```
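The `models` and `evaluator_models` lists replace the former `primary_model`/`secondary_model` pair, so any number of models can be listed and sampled by weight. A minimal sketch of weight-proportional sampling over such a list, using a hypothetical `sample_model` helper (illustrative only, not OpenEvolve's actual code):

```python
import random

# The new config shape: each entry names a model and its sampling weight.
models = [
    {"name": "gemini-2.0-flash-lite", "weight": 0.8},
    {"name": "gemini-2.0-flash", "weight": 0.2},
]

def sample_model(model_list):
    """Pick one model, with probability proportional to its weight."""
    names = [m["name"] for m in model_list]
    weights = [m["weight"] for m in model_list]
    return random.choices(names, weights=weights, k=1)[0]

print(sample_model(models))  # "gemini-2.0-flash-lite" about 80% of the time
```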

examples/function_minimization/evaluator.py

Lines changed: 42 additions & 37 deletions

```diff
@@ -5,8 +5,9 @@
 import importlib.util
 import numpy as np
 import time
-import multiprocessing
+import concurrent.futures
 import traceback
+import signal
 
 
 def run_with_timeout(func, args=(), kwargs={}, timeout_seconds=5):
@@ -22,31 +23,13 @@ def run_with_timeout(func, args=(), kwargs={}, timeout_seconds=5):
     Returns:
         Result of the function or raises TimeoutError
     """
-
-    def wrapper(queue, func, args, kwargs):
+    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+        future = executor.submit(func, *args, **kwargs)
         try:
-            result = func(*args, **kwargs)
-            queue.put(("success", result))
-        except Exception as e:
-            queue.put(("error", e))
-
-    queue = multiprocessing.Queue()
-    process = multiprocessing.Process(target=wrapper, args=(queue, func, args, kwargs))
-    process.start()
-    process.join(timeout=timeout_seconds)
-
-    if process.is_alive():
-        process.terminate()
-        process.join()
-        raise TimeoutError(f"Function timed out after {timeout_seconds} seconds")
-
-    if queue.empty():
-        raise TimeoutError("Function ended without returning a result")
-
-    status, result = queue.get()
-    if status == "error":
-        raise result
-    return result
+            result = future.result(timeout=timeout_seconds)
+            return result
+        except concurrent.futures.TimeoutError:
+            raise TimeoutError(f"Function timed out after {timeout_seconds} seconds")
 
 
 def safe_float(value):
@@ -107,15 +90,27 @@ def evaluate(program_path):
             # Run with timeout
             result = run_with_timeout(program.run_search, timeout_seconds=5)
 
-            # Check if we got a tuple of 3 values
-            if not isinstance(result, tuple) or len(result) != 3:
+            # Handle different result formats
+            if isinstance(result, tuple):
+                if len(result) == 3:
+                    x, y, value = result
+                elif len(result) == 2:
+                    # Assume it's (x, y) and calculate value
+                    x, y = result
+                    # Calculate the function value since it wasn't returned
+                    value = np.sin(x) * np.cos(y) + np.sin(x * y) + (x**2 + y**2) / 20
+                    print(f"Trial {trial}: Got 2 values, calculated function value: {value}")
+                else:
+                    print(
+                        f"Trial {trial}: Invalid result format, expected tuple of 2 or 3 values but got {len(result)}"
+                    )
+                    continue
+            else:
                 print(
-                    f"Trial {trial}: Invalid result format, expected tuple of 3 values but got {type(result)}"
+                    f"Trial {trial}: Invalid result format, expected tuple but got {type(result)}"
                 )
                 continue
 
-            x, y, value = result
-
             end_time = time.time()
 
             # Ensure all values are float
@@ -264,15 +259,25 @@ def evaluate_stage1(program_path):
         # Run a single trial with timeout
        result = run_with_timeout(program.run_search, timeout_seconds=5)
 
-        # Check if we got a tuple of 3 values
-        if not isinstance(result, tuple) or len(result) != 3:
-            print(
-                f"Stage 1: Invalid result format, expected tuple of 3 values but got {type(result)}"
-            )
+        # Handle different result formats
+        if isinstance(result, tuple):
+            if len(result) == 3:
+                x, y, value = result
+            elif len(result) == 2:
+                # Assume it's (x, y) and calculate value
+                x, y = result
+                # Calculate the function value since it wasn't returned
+                value = np.sin(x) * np.cos(y) + np.sin(x * y) + (x**2 + y**2) / 20
+                print(f"Stage 1: Got 2 values, calculated function value: {value}")
+            else:
+                print(
+                    f"Stage 1: Invalid result format, expected tuple of 2 or 3 values but got {len(result)}"
+                )
+                return {"runs_successfully": 0.0, "error": "Invalid result format"}
+        else:
+            print(f"Stage 1: Invalid result format, expected tuple but got {type(result)}")
             return {"runs_successfully": 0.0, "error": "Invalid result format"}
 
-        x, y, value = result
-
         # Ensure all values are float
         x = safe_float(x)
         y = safe_float(y)
```
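The timeout helper now runs the candidate `run_search` in a worker thread instead of a separate process. Below is a minimal usage sketch of the thread-based variant with a hypothetical `slow_search` stand-in; note that, unlike the old `multiprocessing` version, a timed-out call is not killed, and the executor's context manager still waits for it to finish before the exception propagates:

```python
import concurrent.futures
import time

def run_with_timeout(func, args=(), kwargs={}, timeout_seconds=5):
    # Thread-based timeout, mirroring the patched evaluator helper.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(func, *args, **kwargs)
        try:
            return future.result(timeout=timeout_seconds)
        except concurrent.futures.TimeoutError:
            raise TimeoutError(f"Function timed out after {timeout_seconds} seconds")

def slow_search():
    time.sleep(2)  # stand-in for a run_search() that takes too long
    return 0.0, 0.0, 0.0

try:
    run_with_timeout(slow_search, timeout_seconds=0.5)
except TimeoutError as exc:
    print(exc)  # printed once slow_search() finishes, roughly 2 s later
```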
Lines changed: 1 addition & 0 deletions

```diff
@@ -0,0 +1 @@
+scipy
```

examples/lm_eval/README.md

Lines changed: 78 additions & 0 deletions

````diff
@@ -0,0 +1,78 @@
+# lm-eval.py
+
+`lm-eval.py` provides a basic benchmark capability for LLM feedback-based evolutionary task solving. The benchmark framework is [EleutherAI's lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness).
+
+*Limitation:* Only generation-only tasks such as gsm8k are supported. This is because tasks that require loglikelihood probabilities do not map well to agent-based solving.
+
+## Usage
+
+```bash
+$ python3 examples/lm_eval/lm-eval.py -h
+usage: lm-eval.py [-h] [--config CONFIG] [--init_file INIT_FILE] [--evaluator_file EVALUATOR_FILE] [--iterations ITERATIONS] [--limit LIMIT] [--tasks TASKS]
+                  [--output_path OUTPUT_PATH]
+
+OpenEvolve <-> lm-evaluation-harness adapter.
+
+options:
+  -h, --help            show this help message and exit
+  --config CONFIG       config file
+  --init_file INIT_FILE
+                        initial content file
+  --evaluator_file EVALUATOR_FILE
+                        evaluator file
+  --iterations ITERATIONS
+                        number of iterations
+  --limit LIMIT         limit the number of examples per task that are executed
+  --tasks TASKS         list of tasks to evaluate
+  --output_path OUTPUT_PATH
+                        output path for results
+```
+
+Early examples that **were meant to** show that more evolution iterations improve task performance -- I suspect the prompting may not be ideal yet:
+```
+$ python3 examples/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 1
+[..]
+Headline metrics:
+gsm8k exact_match,strict-match 80.000%
+[..]
+
+
+$ python3 examples/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 3
+[..]
+Headline metrics:
+gsm8k exact_match,strict-match 90.000%
+[..]
+
+$ python3 examples/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 10
+[..]
+Headline metrics:
+gsm8k exact_match,strict-match 80.000%
+[..]
+
+$ python3 examples/lm_eval/lm-eval.py --tasks gsm8k --limit 10 --iterations 15
+[..]
+Headline metrics:
+gsm8k exact_match,strict-match 70.000%
+[..]
+```
+
+## Warning
+
+- Be aware that this is an early implementation. No extensive benchmarks have been executed so far. With a limit of 10 examples per task and around 10 iterations, the benchmark is meaningless as is.
+- Use the --limit parameter only for tests, not for metric generation.
+- Do not blindly cite the metrics produced by this script without reviewing the generated solutions first.
+
+## References
+
+```bibtex
+@misc{eval-harness,
+  author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},
+  title = {The Language Model Evaluation Harness},
+  month = 07,
+  year = 2024,
+  publisher = {Zenodo},
+  version = {v0.4.3},
+  doi = {10.5281/zenodo.12608602},
+  url = {https://zenodo.org/records/12608602}
+}
+```
````

examples/lm_eval/config.yml

Lines changed: 48 additions & 0 deletions

```diff
@@ -0,0 +1,48 @@
+max_iterations: 1
+checkpoint_interval: 10
+log_level: "INFO"
+
+# LLM configuration
+llm:
+  primary_model: "gemma3:12b-it-qat"
+  #primary_model: "gpt-4o"
+  primary_model_weight: 0.8
+  secondary_model: "gemma3:12b-it-qat"
+  #secondary_model: "gpt-4.1"
+  secondary_model_weight: 0.2
+  # api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
+  # api_base: "https://api.openai.com/v1/"
+  api_base: "http://localhost:11434/v1/"
+  api_key: "ollama"
+  temperature: 0.7
+  top_p: 0.95
+  max_tokens: 4096
+
+# Prompt configuration
+prompt:
+  num_top_programs: 3
+  use_template_stochasticity: true
+  # System prompt is created dynamically during the benchmark in file system_message.txt!
+  template_dir: "examples/lm_eval/prompts"
+
+# Database configuration
+database:
+  population_size: 50
+  archive_size: 20
+  num_islands: 3
+  elite_selection_ratio: 0.2
+  exploitation_ratio: 0.7
+
+# Evaluator configuration
+evaluator:
+  timeout: 60
+  cascade_evaluation: false
+  cascade_thresholds: [0.5, 0.75]
+  parallel_evaluations: 4
+  use_llm_feedback: true
+  llm_feedback_weight: 1.0
+
+
+# Evolution settings
+diff_based_evolution: false
+allow_full_rewrites: true
```
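With `use_llm_feedback: true`, review scores from the evaluator model are blended into the numeric metrics according to `llm_feedback_weight`. The sketch below is only a hypothetical illustration of what a weight of 1.0 could mean for a combined score; the weighting scheme is an assumption for clarity, not OpenEvolve's actual formula:

```python
# Hypothetical blend of numeric metrics with an LLM-assigned score; the
# weighting scheme here is assumed, not taken from OpenEvolve.
def combine_scores(metrics: dict, llm_score: float, llm_feedback_weight: float) -> float:
    base = sum(metrics.values()) / len(metrics) if metrics else 0.0
    return (base + llm_feedback_weight * llm_score) / (1.0 + llm_feedback_weight)

print(combine_scores({"runs_successfully": 1.0, "value_score": 0.6},
                     llm_score=0.9, llm_feedback_weight=1.0))  # 0.85
```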

examples/lm_eval/evaluator_stub.py

Lines changed: 6 additions & 0 deletions

```diff
@@ -0,0 +1,6 @@
+def evaluate_stage1(file_path):
+    return {"not_implemented": 0.0}
+
+
+def evaluate(file_path):
+    return evaluate_stage1(file_path)
```
Lines changed: 1 addition & 0 deletions

```diff
@@ -0,0 +1 @@
+insert the answer to the task here!
```
