Commit 65118b5
Update evaluator threshold logic and config parameters
Updated the evaluator to prioritize 'combined_score' when checking thresholds, for consistency with evolution, falling back to averaging the numeric metrics when it is not present. Increased the evaluator timeout and cascade threshold in the config, switched prompt evaluation to use max_tokens from the config, and updated the LLM model name in the config.
1 parent cd5ccef commit 65118b5
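
For illustration, a minimal sketch of what the new threshold check changes in practice (the metric values here are hypothetical, not taken from this commit): a program whose combined_score clears the cascade threshold now advances even if its other metrics would drag the average below it.

# Hypothetical stage-1 metrics, for illustration only
metrics = {"combined_score": 0.85, "accuracy": 0.30}
threshold = 0.8

# Previous behavior: average all numeric metrics except 'error'
old_score = sum(metrics.values()) / len(metrics)  # 0.575 -> would not pass

# New behavior: use combined_score directly, matching what evolution optimizes
new_score = metrics["combined_score"]  # 0.85 -> passes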

3 files changed (+20 -8 lines)


examples/llm_prompt_optimization/config.yaml

Lines changed: 3 additions & 3 deletions
@@ -13,7 +13,7 @@ language: "text" # Explicitly set language to text for prompt evolution
 llm:
   api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
   models:
-    - name: "gemini-2.5-flash-lite" # Using Gemini 2.5 Flash Lite
+    - name: "gemini-2.5-flash" # Using Gemini 2.5 Flash
       weight: 1.0
 
   temperature: 0.4 # Optimal from experiments
@@ -67,8 +67,8 @@ database:
 
 # Evaluator Configuration
 evaluator:
-  timeout: 600
+  timeout: 1000
   max_retries: 3
   parallel_evaluations: 4
   cascade_evaluation: true # Two-stage cascading evaluation
-  cascade_thresholds: [0.4] # Stage 1 must achieve 90% accuracy to proceed to stage 2
+  cascade_thresholds: [0.8] # Stage 1 must achieve 80% accuracy to proceed to stage 2

examples/llm_prompt_optimization/evaluator.py

Lines changed: 6 additions & 4 deletions
@@ -32,6 +32,10 @@
 evaluator_config = config.get('evaluator', {})
 MAX_RETRIES = evaluator_config.get('max_retries', 3)
 
+# Get max_tokens from LLM config
+MAX_TOKENS = llm_config.get('max_tokens', 16000)
+print(f"Using max_tokens: {MAX_TOKENS}")
+
 # Initialize OpenAI client once for all evaluations
 test_model = OpenAI(base_url=api_base)
 print(f"Initialized OpenAI client with model: {TASK_MODEL_NAME}")
@@ -193,14 +197,12 @@ def evaluate_prompt(prompt, dataset, config, num_samples):
     # Call the LLM with retry logic
     for attempt in range(MAX_RETRIES):
         try:
-            # Adjust max_tokens based on task
-            max_tokens = 500 if is_gsm8k else 20
-
+            # Use max_tokens from config
             response = test_model.chat.completions.create(
                 model=TASK_MODEL_NAME,
                 messages=messages,
                 temperature=0.1, # Low temperature for consistent results
-                max_tokens=max_tokens
+                max_tokens=MAX_TOKENS
             )
             break
         except Exception as e:
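
The new MAX_TOKENS line assumes an llm_config dict defined earlier in evaluator.py, outside this hunk. A minimal sketch of how that value presumably reaches the evaluator, assuming the example loads its config.yaml with PyYAML (the actual loading code is not shown in this diff):

import yaml

# Assumed setup (not shown in the hunk): load the example's config.yaml
with open("config.yaml") as f:
    config = yaml.safe_load(f)

llm_config = config.get("llm", {})
MAX_TOKENS = llm_config.get("max_tokens", 16000)  # defaults to 16000 if the key is absent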

openevolve/evaluator.py

Lines changed: 11 additions & 1 deletion
@@ -644,6 +644,9 @@ def _create_cascade_error_context(self, stage: str, error: Exception) -> dict:
     def _passes_threshold(self, metrics: Dict[str, float], threshold: float) -> bool:
         """
         Check if metrics pass a threshold
+
+        Uses 'combined_score' if available (for consistency with evolution),
+        otherwise falls back to averaging all numeric metrics except 'error'
 
         Args:
             metrics: Dictionary of metric name to score
@@ -655,7 +658,14 @@ def _passes_threshold(self, metrics: Dict[str, float], threshold: float) -> bool
         if not metrics:
            return False
 
-        # Calculate average score, skipping non-numeric values and 'error' key
+        # Use combined_score if available - this is what evolution uses
+        if "combined_score" in metrics:
+            score = metrics.get("combined_score")
+            if isinstance(score, (int, float)):
+                return float(score) >= threshold
+
+        # Fallback: average all numeric metrics except 'error'
+        # This maintains backward compatibility
         valid_metrics = []
         for name, value in metrics.items():
             # Skip 'error' keys and ensure values are numeric
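
The hunk above ends before the averaging fallback finishes. A sketch of how the whole method plausibly reads after this commit; everything past the valid_metrics loop is reconstructed for illustration under the assumption that the unchanged tail simply averages the surviving values and compares against the threshold:

from typing import Dict

def _passes_threshold(self, metrics: Dict[str, float], threshold: float) -> bool:
    if not metrics:
        return False

    # Use combined_score if available - this is what evolution uses
    if "combined_score" in metrics:
        score = metrics.get("combined_score")
        if isinstance(score, (int, float)):
            return float(score) >= threshold

    # Fallback: average all numeric metrics except 'error'
    valid_metrics = []
    for name, value in metrics.items():
        # Skip 'error' keys and ensure values are numeric (assumed check)
        if name == "error" or not isinstance(value, (int, float)):
            continue
        valid_metrics.append(float(value))

    # Reconstructed tail (assumption, not shown in the diff)
    if not valid_metrics:
        return False
    return sum(valid_metrics) / len(valid_metrics) >= threshold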
