
Commit 0e54243

Add eval script for AIME 2024
1 parent 97eb708 commit 0e54243

File tree

3 files changed: +249 -3 lines changed

  README.md
  optillm/plugins/executecode_plugin.py
  scripts/eval_aime_benchmark.py

README.md

Lines changed: 2 additions & 2 deletions
@@ -191,8 +191,8 @@ response = client.chat.completions.create(
 - e.g. for llama.cpp, run `python3 optillm.py --base_url http://localhost:8080/v1`

 > [!WARNING]
-> Note that llama-server (and ollama) currently does not support sampling multiple responses from a model, which limits the available approaches to the following:
-> `cot_reflection`, `leap`, `plansearch`, `rstar`, `rto`, `self_consistency`, `re2`, and `z3`. Use the built-in local inference server to use these approaches.
+> Note that the Anthropic API, llama-server (and ollama) currently do not support sampling multiple responses from a model, which limits the available approaches to the following:
+> `cot_reflection`, `leap`, `plansearch`, `rstar`, `rto`, `self_consistency`, `re2`, and `z3`. For models on HuggingFace, you can use the built-in local inference server, as it supports multiple responses.

 ## Implemented techniques
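To make the warning above concrete, here is a minimal sketch of a request that stays within the listed approaches. It assumes optillm is running as a proxy on localhost:8000 in front of llama-server (or ollama), and that the approach is selected by prefixing the model name; the endpoint, API key, and model name below are placeholders.

# Illustrative sketch only; endpoint, API key, and model name are placeholders.
from openai import OpenAI

client = OpenAI(api_key="none", base_url="http://localhost:8000/v1")  # optillm proxy
response = client.chat.completions.create(
    model="re2-my-local-model",  # assumed approach-prefix convention: re2 + underlying model
    messages=[{"role": "user", "content": "What is 6 * 7?"}],
)
print(response.choices[0].message.content)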

optillm/plugins/executecode_plugin.py

Lines changed: 9 additions & 1 deletion
@@ -8,6 +8,14 @@

 SLUG = "executecode"

+EXECUTE_CODE_PROMPT = '''Generate Python code to solve this problem. Put the code in a ```python block. The code:
+1. Should use standard Python libraries (math, itertools, etc.)
+2. Should print the final answer
+3. Should be complete and runnable
+4. Should include example test cases if relevant
+
+The code will be automatically executed when submitted.'''
+
 def extract_python_code(text: str) -> List[str]:
     """Extract Python code blocks from text."""
     # print(f"Extracting code: {text}")
@@ -78,7 +86,7 @@ def run(system_prompt: str, initial_query: str, client, model: str) -> Tuple[str
     else:
         # Get initial response from the model
         messages = [
-            {"role": "system", "content": system_prompt},
+            {"role": "system", "content": system_prompt + EXECUTE_CODE_PROMPT},
             {"role": "user", "content": initial_query}
         ]
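Only the signature of extract_python_code appears in this hunk. For context, a minimal regex-based sketch of such an extractor (illustrative only, not necessarily the plugin's actual implementation) could look like this:

import re
from typing import List

def extract_python_code(text: str) -> List[str]:
    # Return the contents of every ```python ... ``` fenced block in the text.
    return [block.strip() for block in re.findall(r"```python\s*(.*?)```", text, re.DOTALL)]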

scripts/eval_aime_benchmark.py

Lines changed: 238 additions & 0 deletions
@@ -0,0 +1,238 @@
import argparse
import json
import os
import logging
import re
from typing import List, Dict, Tuple, Optional
from datetime import datetime

from openai import OpenAI
from datasets import load_dataset
from tqdm import tqdm

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize OpenAI client
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"), base_url="http://localhost:8000/v1")

SYSTEM_PROMPT = '''You are solving AIME (American Invitational Mathematics Examination) problems.

Important: Always end your solution with the final answer in one of these two formats:

1. \\[
\\boxed{X}.
\\]

2. $n=\\boxed{X}$

where X is your integer answer between 0 and 999.'''

def load_2024_dataset() -> list[dict]:
    """
    Load the dataset of problems.
    Returns:
        list[dict]: The dataset of problems.
    """
    dataset_original = load_dataset("AI-MO/aimo-validation-aime")
    # Filter out problems that are not from 2024
    dataset = dataset_original["train"].filter(lambda example: "2024" in example["url"])
    logging.debug(f"Filtered dataset size: {len(dataset)}.")
    assert len(dataset) == 30, f"Expected 30 problems after filtering by 2024, but found {len(dataset)}"
    return dataset

def extract_answer(response: str) -> Optional[int]:
    """
    Extract the numerical answer from a math solution response.
    Handles various formats of boxed answers and falls back to last number if needed.

    Args:
        response (str): The complete response text from the model

    Returns:
        Optional[int]: The extracted answer as an integer, or None if no valid answer found
    """
    if not response:
        return None

    # Clean the response: normalize whitespace and handle potential Unicode
    response = ' '.join(response.split())

    # List of regex patterns to try, in order of preference
    patterns = [
        # $n=\boxed{X}$ format
        r'\$n=\\boxed{(\d+)}\$',

        # LaTeX display style answer: \[\boxed{X}\] or \[\boxed{X}.\]
        r'\\\[\\boxed{(\d+)}\\\]',
        r'\\\[\\boxed{(\d+)}\.\\\]',

        # Inline LaTeX \boxed{X}
        r'\\boxed{(\d+)}',

        # Common variations
        r'\$\\boxed{(\d+)}\$',
        r'boxed{(\d+)}',

        # Less strict patterns
        r'\\boxed\s*{\s*(\d+)\s*}',
        r'\bboxed\s*{\s*(\d+)\s*}',

        # Plain text answer indicators
        r'final answer is[^\d]*(\d+)',
        r'answer is[^\d]*(\d+)',
        r'answer:[^\d]*(\d+)',
        r'= ?(\d+)$'
    ]

    # Try each pattern in order
    for pattern in patterns:
        matches = re.finditer(pattern, response, re.IGNORECASE)
        # Get the last match for this pattern (in case there are multiple)
        last_match = None
        for match in matches:
            last_match = match

        if last_match:
            try:
                return int(last_match.group(1))
            except (ValueError, IndexError):
                continue

    # Fallback: Extract all numbers and take the last one
    # This is our last resort, assuming the answer typically comes last
    numbers = re.findall(r'(\d+)', response)
    if numbers:
        try:
            # Convert to int and return the last number found
            return int(numbers[-1])
        except ValueError:
            pass

    # If all methods fail, return None
    return None

def get_llm_response(problem: str, model: str) -> str:
    """
    Get response from the LLM for a given problem.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": problem}
            ],
            max_tokens=8192,
            # extra_body={
            #     "decoding": "entropy_decoding",
            # }
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        logger.error(f"Error getting LLM response: {e}")
        return ""

def evaluate_response(predicted_answer: Optional[int], correct_answer: int) -> bool:
    """
    Evaluate if the predicted answer matches the correct answer.
    """
    if predicted_answer is None:
        return False
    return predicted_answer == correct_answer

def load_existing_results(filename: str) -> List[Dict]:
    """Load existing results from file if it exists."""
    try:
        with open(filename, 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        return []

def save_result(filename: str, result: Dict):
    """Save a single result to the results file."""
    results = load_existing_results(filename)
    results.append(result)
    with open(filename, 'w') as f:
        json.dump(results, f, indent=2)

def get_last_processed_index(results: List[Dict]) -> int:
    """Get the index of the last processed problem."""
    if not results:
        return -1
    return max(int(r.get('index', -1)) for r in results)

def analyze_results(results: List[Dict]):
    """Analyze and print summary statistics of the results."""
    total = len(results)
    correct = sum(1 for r in results if r['is_correct'])
    accuracy = correct / total if total > 0 else 0

    print("\n=== Results Summary ===")
    print(f"Total problems: {total}")
    print(f"Correct answers: {correct}")
    print(f"Accuracy: {accuracy:.2%}")

    # Print incorrect problems for analysis
    print("\n=== Incorrect Answers ===")
    for r in results:
        if not r['is_correct']:
            print(f"Problem {r['index']}:")
            print(f"Expected: {r['correct_answer']}")
            print(f"Predicted: {r['predicted_answer']}")
            print("---")

def main(model: str):
    """Main evaluation function."""
    # Create results directory if it doesn't exist
    os.makedirs("results", exist_ok=True)

    # Setup results file (stored in the results directory created above)
    results_file = os.path.join("results", f"evaluation_results_{model.replace('/', '_')}.json")

    # Load dataset
    dataset = load_2024_dataset()

    # Load existing results
    existing_results = load_existing_results(results_file)
    last_processed_index = get_last_processed_index(existing_results)

    # Process problems
    for idx, item in enumerate(tqdm(dataset, desc="Evaluating problems")):
        if idx <= last_processed_index:
            continue

        problem_text = item['problem']
        correct_answer = int(item['answer'])

        # Get model response
        response = get_llm_response(problem_text, model)
        logger.debug(f"Response: {response}")
        predicted_answer = extract_answer(response)
        is_correct = evaluate_response(predicted_answer, correct_answer)

        # Save result
        result = {
            "index": idx,
            "problem": problem_text,
            "model_response": response,
            "predicted_answer": predicted_answer,
            "correct_answer": correct_answer,
            "is_correct": is_correct
        }
        save_result(results_file, result)

        # Optional: Add delay between requests if needed
        # time.sleep(5)

    # Analyze results
    final_results = load_existing_results(results_file)
    analyze_results(final_results)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Evaluate LLM performance on AIME 2024 problems")
    parser.add_argument("--model", type=str, required=True, help="OpenAI model to use (e.g., gpt-4, gpt-3.5-turbo)")
    args = parser.parse_args()

    main(args.model)
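The script is invoked as python scripts/eval_aime_benchmark.py --model <model> and expects an OpenAI-compatible endpoint on localhost:8000 (e.g. optillm's local inference server) plus OPENAI_API_KEY in the environment. As a hypothetical sanity check of extract_answer against the two answer formats the system prompt requests, plus the plain-text fallback:

# Illustrative only; importing the module constructs the OpenAI client at import time,
# so OPENAI_API_KEY must be set and scripts/ must be on the import path (e.g. run from scripts/).
from eval_aime_benchmark import extract_answer

assert extract_answer(r"so the count is $n=\boxed{73}$") == 73   # inline $n=\boxed{X}$ format
assert extract_answer(r"Therefore \[ \boxed{204}. \]") == 204    # display-style boxed answer
assert extract_answer("The final answer is 17") == 17            # plain-text fallback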
