Commit 1f08698: simplify metrics
Parent: 0ed87f9

4 files changed: +94 -27 lines
examples/function_minimization/config.yaml (6 additions, 3 deletions)
@@ -5,11 +5,14 @@ log_level: "INFO"
 
 # LLM configuration
 llm:
-  primary_model: "gemini-2.0-flash-lite"
+  # primary_model: "gemini-2.0-flash-lite"
+  primary_model: "llama3.1-8b"
   primary_model_weight: 0.8
-  secondary_model: "gemini-2.0-flash"
+  # secondary_model: "gemini-2.0-flash"
+  secondary_model: "llama-4-scout-17b-16e-instruct"
   secondary_model_weight: 0.2
-  api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
+  # api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
+  api_base: "https://api.cerebras.ai/v1"
   temperature: 0.7
   top_p: 0.95
   max_tokens: 4096
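The replacement endpoint is OpenAI-compatible, just like the Gemini endpoint it comments out, so only the base URL and model names change. As a quick connectivity check, something like the following should work (a minimal sketch using the standard openai Python client; the CEREBRAS_API_KEY variable name is an assumption, not something defined in this repo):

```python
# Minimal connectivity check for the new api_base/model pair.
# Assumes the standard `openai` Python client (>=1.0) and an API key in
# CEREBRAS_API_KEY; the variable name is illustrative, not from this repo.
import os
from openai import OpenAI

client = OpenAI(
    base_url="https://api.cerebras.ai/v1",
    api_key=os.environ["CEREBRAS_API_KEY"],
)

resp = client.chat.completions.create(
    model="llama3.1-8b",  # matches primary_model in config.yaml
    messages=[{"role": "user", "content": "Say OK"}],
    temperature=0.7,
    top_p=0.95,
    max_tokens=16,
)
print(resp.choices[0].message.content)
```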

examples/function_minimization/evaluator.py (36 additions, 4 deletions)
@@ -154,15 +154,32 @@ def evaluate(program_path):
         # Add reliability score based on success rate
         reliability_score = float(success_count / num_trials)
 
-        # Calculate combined score
-        combined_score = float(0.5 * value_score + 0.2 * distance_score + 0.1 * speed_score + 0.2 * reliability_score)
+        # Calculate a single combined score that prioritizes finding good solutions
+        # over secondary metrics like speed and reliability
+        # Value and distance scores (quality of solution) get 90% of the weight
+        # Speed and reliability get only 10% combined
+        combined_score = float(0.6 * value_score + 0.3 * distance_score + 0.05 * speed_score + 0.05 * reliability_score)
+
+        # Also compute an "overall" score that will be the primary metric for selection
+        # This adds a bonus for finding solutions close to the global minimum
+        # and heavily penalizes solutions that aren't finding the right region
+        if distance_to_global < 1.0:  # Very close to the correct solution
+            solution_quality = 1.0
+        elif distance_to_global < 3.0:  # In the right region
+            solution_quality = 0.5
+        else:  # Not finding the right region
+            solution_quality = 0.1
+
+        # Overall score is dominated by solution quality but also factors in the combined score
+        overall_score = 0.8 * solution_quality + 0.2 * combined_score
 
         return {
            "value_score": value_score,
            "distance_score": distance_score,
            "speed_score": speed_score,
            "reliability_score": reliability_score,
            "combined_score": combined_score,
+           "overall_score": overall_score,  # This will be the primary selection metric
            "success_rate": reliability_score
         }
     except Exception as e:
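The effect of the re-weighting is easiest to see with concrete numbers. Below is a small worked sketch of the two formulas; all metric values are hypothetical and chosen only for illustration:

```python
# Illustrative arithmetic for the new weighting; the metric values are
# hypothetical, picked only to show how overall_score behaves.
value_score = 0.8
distance_score = 0.7
speed_score = 0.2
reliability_score = 1.0
distance_to_global = 0.5  # within the < 1.0 "very close" band

combined_score = 0.6 * value_score + 0.3 * distance_score + 0.05 * speed_score + 0.05 * reliability_score
# = 0.48 + 0.21 + 0.01 + 0.05 = 0.75

solution_quality = 1.0 if distance_to_global < 1.0 else 0.5 if distance_to_global < 3.0 else 0.1
overall_score = 0.8 * solution_quality + 0.2 * combined_score
# = 0.8 + 0.15 = 0.95

print(round(combined_score, 4), round(overall_score, 4))  # 0.75 0.95
```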
@@ -222,11 +239,26 @@ def evaluate_stage1(program_path):
         y_diff = float(y) - GLOBAL_MIN_Y
         distance = float(np.sqrt(x_diff**2 + y_diff**2))
 
-        # Basic metrics
+        # Calculate value-based score
+        value_score = float(1.0 / (1.0 + abs(value - GLOBAL_MIN_VALUE)))
+        distance_score = float(1.0 / (1.0 + distance))
+
+        # Calculate solution quality metric
+        if distance < 1.0:  # Very close to the correct solution
+            solution_quality = 1.0
+        elif distance < 3.0:  # In the right region
+            solution_quality = 0.5
+        else:  # Not finding the right region
+            solution_quality = 0.1
+
+        # Basic metrics with overall score
         return {
            "runs_successfully": 1.0,
            "value": float(value),
-           "distance": distance
+           "distance": distance,
+           "value_score": value_score,
+           "distance_score": distance_score,
+           "overall_score": solution_quality  # This becomes a strong guiding metric
         }
     except TimeoutError as e:
         print(f"Stage 1 evaluation timed out: {e}")

openevolve/controller.py (10 additions, 0 deletions)
@@ -276,6 +276,16 @@ async def run(
         if best_program is None:
             best_program = self.database.get_best_program()
             logger.info("Using calculated best program (tracked program not found)")
+
+        # Check if there's a better program by combined_score that wasn't tracked
+        if "combined_score" in best_program.metrics:
+            best_by_combined = self.database.get_best_program(metric="combined_score")
+            if best_by_combined and best_by_combined.id != best_program.id and "combined_score" in best_by_combined.metrics:
+                # If the combined_score of this program is significantly better, use it instead
+                if best_by_combined.metrics["combined_score"] > best_program.metrics["combined_score"] + 0.02:
+                    logger.warning(f"Found program with better combined_score: {best_by_combined.id}")
+                    logger.warning(f"Score difference: {best_program.metrics['combined_score']:.4f} vs {best_by_combined.metrics['combined_score']:.4f}")
+                    best_program = best_by_combined
 
         if best_program:
             logger.info(
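The 0.02 margin keeps the tracked best program unless the combined_score leader is clearly ahead, so tiny floating-point differences do not flip the final selection. A sketch of that decision with two hypothetical programs (SimpleNamespace stands in for the real Program objects):

```python
from types import SimpleNamespace

# Hypothetical stand-ins for database Program objects; only id/metrics matter here.
tracked_best = SimpleNamespace(id="prog-a", metrics={"combined_score": 0.71, "overall_score": 0.93})
best_by_combined = SimpleNamespace(id="prog-b", metrics={"combined_score": 0.74, "overall_score": 0.90})

MARGIN = 0.02  # same threshold as the controller check
if best_by_combined.metrics["combined_score"] > tracked_best.metrics["combined_score"] + MARGIN:
    chosen = best_by_combined   # 0.74 > 0.71 + 0.02, so the untracked program wins
else:
    chosen = tracked_best

print(chosen.id)  # prog-b
```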

openevolve/database.py (42 additions, 20 deletions)
@@ -159,7 +159,7 @@ def get_best_program(self, metric: Optional[str] = None) -> Optional[Program]:
         Get the best program based on a metric
 
         Args:
-            metric: Metric to use for ranking (uses average if None)
+            metric: Metric to use for ranking (uses combined_score or average if None)
 
         Returns:
             Best program or None if database is empty
@@ -169,6 +169,7 @@ def get_best_program(self, metric: Optional[str] = None) -> Optional[Program]:
 
         # If no specific metric and we have a tracked best program, return it
         if metric is None and self.best_program_id and self.best_program_id in self.programs:
+            logger.debug(f"Using tracked best program: {self.best_program_id}")
             return self.programs[self.best_program_id]
 
         if metric:
@@ -178,20 +179,40 @@ def get_best_program(self, metric: Optional[str] = None) -> Optional[Program]:
                 key=lambda p: p.metrics[metric],
                 reverse=True
             )
+            if sorted_programs:
+                logger.debug(f"Found best program by metric '{metric}': {sorted_programs[0].id}")
+        elif self.programs and all("combined_score" in p.metrics for p in self.programs.values()):
+            # Sort by combined_score if it exists (preferred method)
+            sorted_programs = sorted(
+                self.programs.values(),
+                key=lambda p: p.metrics["combined_score"],
+                reverse=True
+            )
+            if sorted_programs:
+                logger.debug(f"Found best program by combined_score: {sorted_programs[0].id}")
         else:
-            # Sort by average of all metrics
+            # Sort by average of all metrics as fallback
             sorted_programs = sorted(
                 self.programs.values(),
                 key=lambda p: sum(p.metrics.values()) / max(1, len(p.metrics)),
                 reverse=True
             )
+            if sorted_programs:
+                logger.debug(f"Found best program by average metrics: {sorted_programs[0].id}")
 
-        # Update the best program tracking if we found a better program
-        if sorted_programs and (self.best_program_id is None or
-                                sorted_programs[0].id != self.best_program_id):
-            old_id = self.best_program_id
-            self.best_program_id = sorted_programs[0].id
-            logger.info(f"Updated best program tracking: {self.best_program_id} ")
+        # Update the best program tracking if we found a better program
+        if sorted_programs and (self.best_program_id is None or
+                                sorted_programs[0].id != self.best_program_id):
+            old_id = self.best_program_id
+            self.best_program_id = sorted_programs[0].id
+            logger.info(f"Updated best program tracking from {old_id} to {self.best_program_id}")
+
+            # Also log the scores to help understand the update
+            if old_id and old_id in self.programs and "combined_score" in self.programs[old_id].metrics \
+                    and "combined_score" in self.programs[self.best_program_id].metrics:
+                old_score = self.programs[old_id].metrics["combined_score"]
+                new_score = self.programs[self.best_program_id].metrics["combined_score"]
+                logger.info(f"Score change: {old_score:.4f} → {new_score:.4f} ({new_score-old_score:+.4f})")
 
         return sorted_programs[0] if sorted_programs else None
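Selection in get_best_program now falls back in a fixed order: an explicitly requested metric, then combined_score when every program has one, then the plain average. A condensed sketch of that priority, using plain dicts in place of Program objects:

```python
# Simplified stand-ins for the database: programs are just dicts of metrics.
programs = {
    "a": {"combined_score": 0.62, "speed_score": 0.9},
    "b": {"combined_score": 0.74, "speed_score": 0.1},
}

def best_id(programs, metric=None):
    """Sketch of the selection priority: explicit metric, then combined_score, then average."""
    if metric:
        candidates = {k: v for k, v in programs.items() if metric in v}
        return max(candidates, key=lambda k: candidates[k][metric])
    if all("combined_score" in m for m in programs.values()):
        return max(programs, key=lambda k: programs[k]["combined_score"])
    return max(programs, key=lambda k: sum(programs[k].values()) / max(1, len(programs[k])))

print(best_id(programs))                        # "b": higher combined_score (0.74 > 0.62)
print(best_id(programs, metric="speed_score"))  # "a": explicit metric overrides combined_score
```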

@@ -416,7 +437,11 @@ def _is_better(self, program1: Program, program2: Program) -> bool:
         if not program1.metrics and program2.metrics:
             return False
 
-        # Compare average of metrics
+        # Check for combined_score first (this is the preferred metric)
+        if "combined_score" in program1.metrics and "combined_score" in program2.metrics:
+            return program1.metrics["combined_score"] > program2.metrics["combined_score"]
+
+        # Fallback to average of all metrics
         avg1 = sum(program1.metrics.values()) / len(program1.metrics)
         avg2 = sum(program2.metrics.values()) / len(program2.metrics)
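This changes which program wins whenever the all-metric average disagrees with combined_score, as in the hypothetical pair below (plain dicts stand in for Program.metrics):

```python
# Hypothetical metrics where the average and combined_score disagree.
m1 = {"combined_score": 0.80, "speed_score": 0.10}   # average 0.45
m2 = {"combined_score": 0.60, "speed_score": 0.90}   # average 0.75

# Old behaviour: compare averages -> m2 would have counted as "better".
print(sum(m1.values()) / len(m1) > sum(m2.values()) / len(m2))   # False

# New behaviour: compare combined_score directly -> m1 is better.
print(m1["combined_score"] > m2["combined_score"])               # True
```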

@@ -466,18 +491,15 @@ def _update_best_program(self, program: Program) -> None:
         if self._is_better(program, current_best):
             old_id = self.best_program_id
             self.best_program_id = program.id
-            logger.info(f"New best program {program.id} replaces {old_id}")
 
-            # Log improvement in metrics
-            if program.metrics and current_best.metrics:
-                improvements = []
-                for metric, value in program.metrics.items():
-                    if metric in current_best.metrics:
-                        diff = value - current_best.metrics[metric]
-                        improvements.append(f"{metric}: {diff:+.4f}")
-
-                if improvements:
-                    logger.info(f"Metric improvements: {', '.join(improvements)}")
+            # Log the change
+            if "combined_score" in program.metrics and "combined_score" in current_best.metrics:
+                old_score = current_best.metrics["combined_score"]
+                new_score = program.metrics["combined_score"]
+                score_diff = new_score - old_score
+                logger.info(f"New best program {program.id} replaces {old_id} (combined_score: {old_score:.4f} → {new_score:.4f}, +{score_diff:.4f})")
+            else:
+                logger.info(f"New best program {program.id} replaces {old_id}")
 
     def _sample_parent(self) -> Program:
         """