
Commit d795ac4

fixes
1 parent 3a0e57b commit d795ac4

File tree

4 files changed (+128 -103 lines changed)

Lines changed: 15 additions & 21 deletions
@@ -1,27 +1,23 @@
 # Configuration for function minimization example
-max_iterations: 100
-checkpoint_interval: 10
-log_level: "INFO"
+max_iterations: 50
+checkpoint_interval: 5
 
 # LLM configuration
 llm:
-  # primary_model: "gemini-2.0-flash-lite"
-  primary_model: "llama3.1-8b"
-  primary_model_weight: 0.8
-  # secondary_model: "gemini-2.0-flash"
-  secondary_model: "llama-4-scout-17b-16e-instruct"
-  secondary_model_weight: 0.2
-  # api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
-  api_base: "https://api.cerebras.ai/v1"
-  temperature: 0.7
-  top_p: 0.95
-  max_tokens: 4096
+  primary_model: "gemini-2.5-flash-lite"
+  # primary_model: "llama3.1-8b"
+  primary_model_weight: 0.9
+  secondary_model: "gemini-2.5-flash"
+  # secondary_model: "llama-4-scout-17b-16e-instruct"
+  secondary_model_weight: 0.1
+  api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
+  # api_base: "https://api.cerebras.ai/v1"
+  temperature: 0.4
+  max_tokens: 4000
 
 # Prompt configuration
 prompt:
   system_message: "You are an expert programmer specializing in optimization algorithms. Your task is to improve a function minimization algorithm to find the global minimum of a complex function with many local minima. The function is f(x, y) = sin(x) * cos(y) + sin(x*y) + (x^2 + y^2)/20. Focus on improving the search_algorithm function to reliably find the global minimum, escaping local minima that might trap simple algorithms."
-  num_top_programs: 3
-  use_template_stochasticity: true
 
 # Database configuration
 database:
@@ -34,11 +30,9 @@ database:
 # Evaluator configuration
 evaluator:
   timeout: 60
-  cascade_evaluation: true
   cascade_thresholds: [0.5, 0.75]
-  parallel_evaluations: 4
-  use_llm_feedback: false
+  parallel_evaluations: 3
 
 # Evolution settings
-diff_based_evolution: true
-allow_full_rewrites: false
+diff_based_evolution: false
+max_code_length: 20000
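
Note on the model weights above: primary_model_weight: 0.9 and secondary_model_weight: 0.1 suggest that the evolution LLM ensemble picks one of the two models per request in proportion to these weights. The actual selection logic lives in openevolve and is not part of this commit; the snippet below is only a minimal sketch of weight-proportional sampling under that assumption.

import random

# Hypothetical sketch only: choose a model per generation request in
# proportion to the configured weights (assumed behaviour, not the
# actual openevolve implementation).
models = [("gemini-2.5-flash-lite", 0.9), ("gemini-2.5-flash", 0.1)]
names, weights = zip(*models)
chosen = random.choices(names, weights=weights, k=1)[0]
print(f"Routing this request to: {chosen}")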

examples/function_minimization/evaluator.py

Lines changed: 70 additions & 58 deletions
@@ -69,7 +69,7 @@ def evaluate(program_path):
             return {
                 "value_score": 0.0,
                 "distance_score": 0.0,
-                "speed_score": 0.0,
+                "reliability_score": 0.0,
                 "combined_score": 0.0,
                 "error": "Missing run_search function",
             }
@@ -162,7 +162,7 @@ def evaluate(program_path):
             return {
                 "value_score": 0.0,
                 "distance_score": 0.0,
-                "speed_score": 0.0,
+                "reliability_score": 0.0,
                 "combined_score": 0.0,
                 "error": "All trials failed",
             }
@@ -173,65 +173,40 @@ def evaluate(program_path):
         avg_time = float(np.mean(times)) if times else 1.0
 
         # Convert to scores (higher is better)
-        value_score = float(1.0 / (1.0 + abs(avg_value - GLOBAL_MIN_VALUE)))  # Normalize and invert
+        value_score = float(1.0 / (1.0 + abs(avg_value - GLOBAL_MIN_VALUE)))
         distance_score = float(1.0 / (1.0 + avg_distance))
-        speed_score = float(1.0 / avg_time) if avg_time > 0 else 0.0
-
-        # calculate standard deviation scores
-        # get x_std_score
-        x_std_score = float(1.0 / (1.0 + np.std(x_values)))
-        # get y_std_score
-        y_std_score = float(1.0 / (1.0 + np.std(y_values)))
-        standard_deviation_score = (x_std_score + y_std_score) / 2.0
-
-        # Normalize speed score (so it doesn't dominate)
-        speed_score = float(min(speed_score, 10.0) / 10.0)
-
+
         # Add reliability score based on success rate
         reliability_score = float(success_count / num_trials)
 
-        # Calculate a single combined score that prioritizes finding good solutions
-        # over secondary metrics like speed and reliability
-        # Value and distance scores (quality of solution) get 90% of the weight
-        # Speed and reliability get only 10% combined
-        combined_score = float(
-            0.35 * value_score
-            + 0.35 * distance_score
-            + standard_deviation_score * 0.20
-            + 0.05 * speed_score
-            + 0.05 * reliability_score
-        )
-
-        # Also compute an "overall" score that will be the primary metric for selection
-        # This adds a bonus for finding solutions close to the global minimum
-        # and heavily penalizes solutions that aren't finding the right region
-        if distance_to_global < 1.0:  # Very close to the correct solution
-            solution_quality = 1.0
-        elif distance_to_global < 3.0:  # In the right region
-            solution_quality = 0.5
+        # Calculate solution quality based on distance to global minimum
+        if avg_distance < 0.5:  # Very close to the correct solution
+            solution_quality_multiplier = 1.5  # 50% bonus
+        elif avg_distance < 1.5:  # In the right region
+            solution_quality_multiplier = 1.2  # 20% bonus
+        elif avg_distance < 3.0:  # Getting closer
+            solution_quality_multiplier = 1.0  # No adjustment
        else:  # Not finding the right region
-            solution_quality = 0.1
+            solution_quality_multiplier = 0.7  # 30% penalty
 
-        # Overall score is dominated by solution quality but also factors in the combined score
-        overall_score = 0.8 * solution_quality + 0.2 * combined_score
+        # Calculate combined score that prioritizes finding the global minimum
+        # Base score from value and distance, then apply solution quality multiplier
+        base_score = 0.5 * value_score + 0.3 * distance_score + 0.2 * reliability_score
+        combined_score = float(base_score * solution_quality_multiplier)
 
         return {
             "value_score": value_score,
             "distance_score": distance_score,
-            "standard_deviation_score": standard_deviation_score,
-            "speed_score": speed_score,
             "reliability_score": reliability_score,
             "combined_score": combined_score,
-            "overall_score": overall_score,  # This will be the primary selection metric
-            "success_rate": reliability_score,
         }
     except Exception as e:
         print(f"Evaluation failed completely: {str(e)}")
         print(traceback.format_exc())
         return {
             "value_score": 0.0,
             "distance_score": 0.0,
-            "speed_score": 0.0,
+            "reliability_score": 0.0,
             "combined_score": 0.0,
             "error": str(e),
         }
@@ -255,7 +230,11 @@ def evaluate_stage1(program_path):
         # Check if the required function exists
         if not hasattr(program, "run_search"):
             print(f"Stage 1 validation: Program does not have 'run_search' function")
-            return {"runs_successfully": 0.0, "error": "Missing run_search function"}
+            return {
+                "runs_successfully": 0.0,
+                "combined_score": 0.0,
+                "error": "Missing run_search function"
+            }
 
         try:
             # Run a single trial with timeout
@@ -275,10 +254,18 @@ def evaluate_stage1(program_path):
                     print(
                         f"Stage 1: Invalid result format, expected tuple of 2 or 3 values but got {len(result)}"
                     )
-                    return {"runs_successfully": 0.0, "error": "Invalid result format"}
+                    return {
+                        "runs_successfully": 0.0,
+                        "combined_score": 0.0,
+                        "error": "Invalid result format"
+                    }
             else:
                 print(f"Stage 1: Invalid result format, expected tuple but got {type(result)}")
-                return {"runs_successfully": 0.0, "error": "Invalid result format"}
+                return {
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": "Invalid result format"
+                }
 
             # Ensure all values are float
             x = safe_float(x)
@@ -295,7 +282,11 @@ def evaluate_stage1(program_path):
                 or np.isinf(value)
             ):
                 print(f"Stage 1 validation: Invalid result, got x={x}, y={y}, value={value}")
-                return {"runs_successfully": 0.5, "error": "Invalid result values"}
+                return {
+                    "runs_successfully": 0.5,
+                    "combined_score": 0.0,
+                    "error": "Invalid result values"
+                }
 
             # Calculate distance safely
             x_diff = float(x) - GLOBAL_MIN_X
@@ -306,38 +297,59 @@ def evaluate_stage1(program_path):
             value_score = float(1.0 / (1.0 + abs(value - GLOBAL_MIN_VALUE)))
             distance_score = float(1.0 / (1.0 + distance))
 
-            # Calculate solution quality metric
-            if distance < 1.0:  # Very close to the correct solution
-                solution_quality = 1.0
-            elif distance < 3.0:  # In the right region
-                solution_quality = 0.5
+            # Calculate solution quality based on distance to global minimum
+            if distance < 0.5:  # Very close to the correct solution
+                solution_quality_multiplier = 1.4  # 40% bonus
+            elif distance < 1.5:  # In the right region
+                solution_quality_multiplier = 1.15  # 15% bonus
+            elif distance < 3.0:  # Getting closer
+                solution_quality_multiplier = 1.0  # No adjustment
             else:  # Not finding the right region
-                solution_quality = 0.1
+                solution_quality_multiplier = 0.8  # 20% penalty
+
+            # Calculate combined score for stage 1
+            base_score = 0.6 * value_score + 0.4 * distance_score
+            combined_score = float(base_score * solution_quality_multiplier)
 
-            # Basic metrics with overall score
             return {
                 "runs_successfully": 1.0,
                 "value_score": value_score,
                 "distance_score": distance_score,
-                "overall_score": solution_quality,  # This becomes a strong guiding metric
+                "combined_score": combined_score,
             }
         except TimeoutError as e:
             print(f"Stage 1 evaluation timed out: {e}")
-            return {"runs_successfully": 0.0, "error": "Timeout"}
+            return {
+                "runs_successfully": 0.0,
+                "combined_score": 0.0,
+                "error": "Timeout"
+            }
         except IndexError as e:
             # Specifically handle IndexError which often happens with early termination checks
             print(f"Stage 1 evaluation failed with IndexError: {e}")
             print("This is likely due to a list index check before the list is fully populated.")
-            return {"runs_successfully": 0.0, "error": f"IndexError: {str(e)}"}
+            return {
+                "runs_successfully": 0.0,
+                "combined_score": 0.0,
+                "error": f"IndexError: {str(e)}"
+            }
         except Exception as e:
             print(f"Stage 1 evaluation failed: {e}")
             print(traceback.format_exc())
-            return {"runs_successfully": 0.0, "error": str(e)}
+            return {
+                "runs_successfully": 0.0,
+                "combined_score": 0.0,
+                "error": str(e)
+            }
 
     except Exception as e:
         print(f"Stage 1 evaluation failed: {e}")
         print(traceback.format_exc())
-        return {"runs_successfully": 0.0, "error": str(e)}
+        return {
+            "runs_successfully": 0.0,
+            "combined_score": 0.0,
+            "error": str(e)
+        }
 
 
 def evaluate_stage2(program_path):
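
Stage 1 now reports the same combined_score key as the full evaluation (every return path includes it, even the error paths), but with a 0.6/0.4 value/distance base and gentler multipliers (1.4 / 1.15 / 1.0 / 0.8). Reusing the hypothetical scores from the earlier example (value_score = 0.8, distance_score = 0.7, distance < 0.5):

# Stage 1 weighting, same illustrative inputs as the earlier example.
base_score = 0.6 * 0.8 + 0.4 * 0.7  # 0.76
combined_score = base_score * 1.4   # 1.064, with the distance < 0.5 bonus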

openevolve/config.py

Lines changed: 24 additions & 19 deletions
@@ -56,12 +56,7 @@ class LLMConfig(LLMModelConfig):
     retry_delay: int = 5
 
     # n-model configuration for evolution LLM ensemble
-    models: List[LLMModelConfig] = field(
-        default_factory=lambda: [
-            LLMModelConfig(name="gpt-4o-mini", weight=0.8),
-            LLMModelConfig(name="gpt-4o", weight=0.2),
-        ]
-    )
+    models: List[LLMModelConfig] = field(default_factory=list)
 
     # n-model configuration for evaluator LLM ensemble
     evaluator_models: List[LLMModelConfig] = field(default_factory=lambda: [])
@@ -75,24 +70,34 @@ class LLMConfig(LLMModelConfig):
     def __post_init__(self):
         """Post-initialization to set up model configurations"""
         # Handle backward compatibility for primary_model(_weight) and secondary_model(_weight).
-        if (self.primary_model or self.primary_model_weight) and len(self.models) < 1:
-            # Ensure we have a primary model
-            self.models.append(LLMModelConfig())
         if self.primary_model:
-            self.models[0].name = self.primary_model
-        if self.primary_model_weight:
-            self.models[0].weight = self.primary_model_weight
+            # Create primary model
+            primary_model = LLMModelConfig(
+                name=self.primary_model,
+                weight=self.primary_model_weight or 1.0
+            )
+            self.models.append(primary_model)
 
-        if (self.secondary_model or self.secondary_model_weight) and len(self.models) < 2:
-            # Ensure we have a second model
-            self.models.append(LLMModelConfig())
         if self.secondary_model:
-            self.models[1].name = self.secondary_model
-        if self.secondary_model_weight:
-            self.models[1].weight = self.secondary_model_weight
+            # Create secondary model (only if weight > 0)
+            if not self.secondary_model_weight or self.secondary_model_weight > 0:
+                secondary_model = LLMModelConfig(
+                    name=self.secondary_model,
+                    weight=self.secondary_model_weight or 0.2
+                )
+                self.models.append(secondary_model)
+
+        # Only validate if this looks like a user config (has some model info)
+        # Don't validate during internal/default initialization
+        if (self.primary_model or self.secondary_model or
+            self.primary_model_weight or self.secondary_model_weight) and not self.models:
+            raise ValueError(
+                "No LLM models configured. Please specify 'models' array or "
+                "'primary_model' in your configuration."
+            )
 
         # If no evaluator models are defined, use the same models as for evolution
-        if not self.evaluator_models or len(self.evaluator_models) < 1:
+        if not self.evaluator_models:
            self.evaluator_models = self.models.copy()
 
         # Update models with shared configuration values
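
With the default models list now empty, the primary_model / secondary_model fields are what populate it in __post_init__. A minimal sketch of the resulting behaviour, assuming LLMConfig is importable from openevolve.config (the module shown above) and accepts these fields as keyword arguments:

from openevolve.config import LLMConfig

# Hypothetical construction mirroring the example config above.
cfg = LLMConfig(
    primary_model="gemini-2.5-flash-lite",
    primary_model_weight=0.9,
    secondary_model="gemini-2.5-flash",
    secondary_model_weight=0.1,
)
# __post_init__ should now yield these two entries instead of the old
# hard-coded gpt-4o-mini / gpt-4o defaults.
print([(m.name, m.weight) for m in cfg.models])
# Expected: [('gemini-2.5-flash-lite', 0.9), ('gemini-2.5-flash', 0.1)]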

openevolve/process_parallel.py

Lines changed: 19 additions & 5 deletions
@@ -178,12 +178,26 @@ def _run_iteration_worker(
         iteration_start = time.time()
 
         # Generate code modification (sync wrapper for async)
-        llm_response = asyncio.run(
-            _worker_llm_ensemble.generate_with_context(
-                system_message=prompt["system"],
-                messages=[{"role": "user", "content": prompt["user"]}],
+        try:
+            llm_response = asyncio.run(
+                _worker_llm_ensemble.generate_with_context(
+                    system_message=prompt["system"],
+                    messages=[{"role": "user", "content": prompt["user"]}],
+                )
+            )
+        except Exception as e:
+            logger.error(f"LLM generation failed: {e}")
+            return SerializableResult(
+                error=f"LLM generation failed: {str(e)}",
+                iteration=iteration
+            )
+
+        # Check for None response
+        if llm_response is None:
+            return SerializableResult(
+                error="LLM returned None response",
+                iteration=iteration
             )
-        )
 
         # Parse response based on evolution mode
         if _worker_config.diff_based_evolution:
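
With this change the worker returns a SerializableResult carrying an error string instead of crashing when LLM generation raises or returns None. The orchestration side is not part of this diff; the sketch below is only an illustration of how a parent process might consume such a result, using the error and iteration fields shown above:

import logging

logger = logging.getLogger(__name__)

def handle_worker_result(result) -> bool:
    """Illustrative helper (not part of openevolve): skip failed iterations."""
    if result.error:
        logger.warning(f"Iteration {result.iteration} skipped: {result.error}")
        return False
    return True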
