Commit 65118b5
Update evaluator threshold logic and config parameters
Updated the evaluator to prioritize 'combined_score' when checking thresholds, for consistency with evolution, falling back to averaging the numeric metrics when it is not present. Increased the evaluator timeout and cascade threshold in the config, switched prompt evaluation to use max_tokens from the config, and updated the LLM model name in the config.
1 parent cd5ccef commit 65118b5
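
For illustration, a minimal sketch of what the new threshold check changes in practice (the metric values here are hypothetical, not taken from this commit): a program whose combined_score clears the cascade threshold now advances even if its other metrics would drag the average below it.

# Hypothetical stage-1 metrics, for illustration only
metrics = {"combined_score": 0.85, "accuracy": 0.30}
threshold = 0.8

# Previous behavior: average all numeric metrics except 'error'
old_score = sum(metrics.values()) / len(metrics)  # 0.575 -> would not pass

# New behavior: use combined_score directly, matching what evolution optimizes
new_score = metrics["combined_score"]  # 0.85 -> passes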

3 files changed (+20 -8 lines)


examples/llm_prompt_optimization/config.yaml

Lines changed: 3 additions & 3 deletions
@@ -13,7 +13,7 @@ language: "text" # Explicitly set language to text for prompt evolution
 llm:
   api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
   models:
-    - name: "gemini-2.5-flash-lite" # Using Gemini 2.5 Flash Lite
+    - name: "gemini-2.5-flash" # Using Gemini 2.5 Flash
       weight: 1.0
 
   temperature: 0.4 # Optimal from experiments
@@ -67,8 +67,8 @@ database:
 
 # Evaluator Configuration
 evaluator:
-  timeout: 600
+  timeout: 1000
   max_retries: 3
   parallel_evaluations: 4
   cascade_evaluation: true # Two-stage cascading evaluation
-  cascade_thresholds: [0.4] # Stage 1 must achieve 90% accuracy to proceed to stage 2
+  cascade_thresholds: [0.8] # Stage 1 must achieve 80% accuracy to proceed to stage 2

examples/llm_prompt_optimization/evaluator.py

Lines changed: 6 additions & 4 deletions
@@ -32,6 +32,10 @@
 evaluator_config = config.get('evaluator', {})
 MAX_RETRIES = evaluator_config.get('max_retries', 3)
 
+# Get max_tokens from LLM config
+MAX_TOKENS = llm_config.get('max_tokens', 16000)
+print(f"Using max_tokens: {MAX_TOKENS}")
+
 # Initialize OpenAI client once for all evaluations
 test_model = OpenAI(base_url=api_base)
 print(f"Initialized OpenAI client with model: {TASK_MODEL_NAME}")
@@ -193,14 +197,12 @@ def evaluate_prompt(prompt, dataset, config, num_samples):
     # Call the LLM with retry logic
     for attempt in range(MAX_RETRIES):
         try:
-            # Adjust max_tokens based on task
-            max_tokens = 500 if is_gsm8k else 20
-
+            # Use max_tokens from config
             response = test_model.chat.completions.create(
                 model=TASK_MODEL_NAME,
                 messages=messages,
                 temperature=0.1, # Low temperature for consistent results
-                max_tokens=max_tokens
+                max_tokens=MAX_TOKENS
             )
             break
         except Exception as e:
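
The new MAX_TOKENS line assumes an llm_config dict defined earlier in evaluator.py, outside this hunk. A minimal sketch of how that value presumably reaches the evaluator, assuming the example loads its config.yaml with PyYAML (the actual loading code is not shown in this diff):

import yaml

# Assumed setup (not shown in the hunk): load the example's config.yaml
with open("config.yaml") as f:
    config = yaml.safe_load(f)

llm_config = config.get("llm", {})
MAX_TOKENS = llm_config.get("max_tokens", 16000)  # defaults to 16000 if the key is absent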

openevolve/evaluator.py

Lines changed: 11 additions & 1 deletion
@@ -644,6 +644,9 @@ def _create_cascade_error_context(self, stage: str, error: Exception) -> dict:
     def _passes_threshold(self, metrics: Dict[str, float], threshold: float) -> bool:
         """
         Check if metrics pass a threshold
+
+        Uses 'combined_score' if available (for consistency with evolution),
+        otherwise falls back to averaging all numeric metrics except 'error'
 
         Args:
             metrics: Dictionary of metric name to score
@@ -655,7 +658,14 @@ def _passes_threshold(self, metrics: Dict[str, float], threshold: float) -> bool
         if not metrics:
            return False
 
-        # Calculate average score, skipping non-numeric values and 'error' key
+        # Use combined_score if available - this is what evolution uses
+        if "combined_score" in metrics:
+            score = metrics.get("combined_score")
+            if isinstance(score, (int, float)):
+                return float(score) >= threshold
+
+        # Fallback: average all numeric metrics except 'error'
+        # This maintains backward compatibility
         valid_metrics = []
         for name, value in metrics.items():
             # Skip 'error' keys and ensure values are numeric
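
The hunk above ends before the averaging fallback finishes. A sketch of how the whole method plausibly reads after this commit; everything past the valid_metrics loop is reconstructed for illustration under the assumption that the unchanged tail simply averages the surviving values and compares against the threshold:

from typing import Dict

def _passes_threshold(self, metrics: Dict[str, float], threshold: float) -> bool:
    if not metrics:
        return False

    # Use combined_score if available - this is what evolution uses
    if "combined_score" in metrics:
        score = metrics.get("combined_score")
        if isinstance(score, (int, float)):
            return float(score) >= threshold

    # Fallback: average all numeric metrics except 'error'
    valid_metrics = []
    for name, value in metrics.items():
        # Skip 'error' keys and ensure values are numeric (assumed check)
        if name == "error" or not isinstance(value, (int, float)):
            continue
        valid_metrics.append(float(value))

    # Reconstructed tail (assumption, not shown in the diff)
    if not valid_metrics:
        return False
    return sum(valid_metrics) / len(valid_metrics) >= threshold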
