
Commit 78e153b

Commit message: f
1 parent fe7f5ec commit 78e153b

2 files changed: +198 −63 lines changed

examples/function_minimization/config.yaml (3 additions, 3 deletions)
@@ -12,8 +12,8 @@ llm:
   secondary_model_weight: 0.2
   api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
   # api_base: "https://api.cerebras.ai/v1"
-  temperature: 0.6
-  max_tokens: 10000
+  temperature: 0.7
+  max_tokens: 16000
   timeout: 120

 # Prompt configuration
@@ -31,7 +31,7 @@ database:
 # Evaluator configuration
 evaluator:
   timeout: 60
-  cascade_thresholds: [1.45]
+  cascade_thresholds: [1.4]
   parallel_evaluations: 3

 # Evolution settings
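Note: cascade_thresholds is the score bar a program must clear in stage 1 before OpenEvolve spends budget on the full evaluation, so lowering it from 1.45 to 1.4 lets slightly weaker candidates through to stage 2. A rough sketch of that gating, reusing the two stage functions from the evaluator below (a hypothetical wrapper for illustration, not the library's own dispatch code):

# Hypothetical illustration of cascade gating -- not OpenEvolve's actual code.
def cascade_evaluate(program_path, thresholds=(1.4,)):
    """Run the cheap stage-1 check first; only run the full evaluation
    if the stage-1 combined_score clears the configured threshold."""
    stage1 = evaluate_stage1(program_path)
    if stage1.metrics.get("combined_score", 0.0) < thresholds[0]:
        return stage1  # rejected early; stage 2 never runs
    return evaluate_stage2(program_path)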

examples/function_minimization/evaluator.py (195 additions, 60 deletions)
@@ -8,6 +8,7 @@
 import concurrent.futures
 import traceback
 import signal
+from openevolve.evaluation_result import EvaluationResult


 def run_with_timeout(func, args=(), kwargs={}, timeout_seconds=5):
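For context, EvaluationResult is what this whole diff migrates to: it pairs the numeric metrics OpenEvolve uses for selection with free-form artifacts that are surfaced back to the LLM as feedback on the next iteration. A minimal sketch consistent with how the calls below construct it (the real definition lives in openevolve/evaluation_result.py and may carry extra fields or helpers):

# Minimal sketch of the interface this commit relies on -- an assumption,
# not the library's verbatim definition.
from dataclasses import dataclass, field
from typing import Dict, Union

@dataclass
class EvaluationResult:
    # Numeric scores used for selection and cascade thresholds.
    metrics: Dict[str, float] = field(default_factory=dict)
    # Free-form debugging context (error types, suggestions, tracebacks).
    artifacts: Dict[str, Union[str, bytes]] = field(default_factory=dict)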
@@ -66,13 +67,23 @@ def evaluate(program_path):
         # Check if the required function exists
         if not hasattr(program, "run_search"):
             print(f"Error: program does not have 'run_search' function")
-            return {
-                "value_score": 0.0,
-                "distance_score": 0.0,
-                "reliability_score": 0.0,
-                "combined_score": 0.0,
-                "error": "Missing run_search function",
+
+            error_artifacts = {
+                "error_type": "MissingFunction",
+                "error_message": "Program is missing required 'run_search' function",
+                "suggestion": "Make sure your program includes a function named 'run_search' that returns (x, y, value) or (x, y)"
             }
+
+            return EvaluationResult(
+                metrics={
+                    "value_score": 0.0,
+                    "distance_score": 0.0,
+                    "reliability_score": 0.0,
+                    "combined_score": 0.0,
+                    "error": "Missing run_search function",
+                },
+                artifacts=error_artifacts
+            )

         # Run multiple trials
         num_trials = 10
@@ -159,13 +170,22 @@ def evaluate(program_path):

         # If all trials failed, return zero scores
         if success_count == 0:
-            return {
-                "value_score": 0.0,
-                "distance_score": 0.0,
-                "reliability_score": 0.0,
-                "combined_score": 0.0,
-                "error": "All trials failed",
+            error_artifacts = {
+                "error_type": "AllTrialsFailed",
+                "error_message": f"All {num_trials} trials failed - common issues: timeouts, crashes, or invalid return values",
+                "suggestion": "Check for infinite loops, ensure function returns (x, y) or (x, y, value), and verify algorithm terminates within time limit"
             }
+
+            return EvaluationResult(
+                metrics={
+                    "value_score": 0.0,
+                    "distance_score": 0.0,
+                    "reliability_score": 0.0,
+                    "combined_score": 0.0,
+                    "error": "All trials failed",
+                },
+                artifacts=error_artifacts
+            )

         # Calculate metrics
         avg_value = float(np.mean(values))
@@ -194,22 +214,45 @@ def evaluate(program_path):
         base_score = 0.5 * value_score + 0.3 * distance_score + 0.2 * reliability_score
         combined_score = float(base_score * solution_quality_multiplier)

-        return {
-            "value_score": value_score,
-            "distance_score": distance_score,
-            "reliability_score": reliability_score,
-            "combined_score": combined_score,
+        # Add artifacts for successful runs
+        artifacts = {
+            "convergence_info": f"Converged in {num_trials} trials with {success_count} successes",
+            "best_position": f"Final position: x={x_values[-1]:.4f}, y={y_values[-1]:.4f}" if x_values else "No successful trials",
+            "average_distance_to_global": f"{avg_distance:.4f}",
+            "search_efficiency": f"Success rate: {reliability_score:.2%}"
         }
+
+        return EvaluationResult(
+            metrics={
+                "value_score": value_score,
+                "distance_score": distance_score,
+                "reliability_score": reliability_score,
+                "combined_score": combined_score,
+            },
+            artifacts=artifacts
+        )
     except Exception as e:
         print(f"Evaluation failed completely: {str(e)}")
         print(traceback.format_exc())
-        return {
-            "value_score": 0.0,
-            "distance_score": 0.0,
-            "reliability_score": 0.0,
-            "combined_score": 0.0,
-            "error": str(e),
+
+        # Create error artifacts
+        error_artifacts = {
+            "error_type": type(e).__name__,
+            "error_message": str(e),
+            "full_traceback": traceback.format_exc(),
+            "suggestion": "Check for syntax errors or missing imports in the generated code"
         }
+
+        return EvaluationResult(
+            metrics={
+                "value_score": 0.0,
+                "distance_score": 0.0,
+                "reliability_score": 0.0,
+                "combined_score": 0.0,
+                "error": str(e),
+            },
+            artifacts=error_artifacts
+        )


 # Stage-based evaluation for cascade evaluation
@@ -230,11 +273,21 @@ def evaluate_stage1(program_path):
         # Check if the required function exists
         if not hasattr(program, "run_search"):
             print(f"Stage 1 validation: Program does not have 'run_search' function")
-            return {
-                "runs_successfully": 0.0,
-                "combined_score": 0.0,
-                "error": "Missing run_search function"
+
+            error_artifacts = {
+                "error_type": "MissingFunction",
+                "error_message": "Stage 1: Program is missing required 'run_search' function",
+                "suggestion": "Make sure your program includes a function named 'run_search' that returns (x, y, value) or (x, y)"
             }
+
+            return EvaluationResult(
+                metrics={
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": "Missing run_search function"
+                },
+                artifacts=error_artifacts
+            )

         try:
             # Run a single trial with timeout
@@ -254,18 +307,38 @@ def evaluate_stage1(program_path):
                     print(
                         f"Stage 1: Invalid result format, expected tuple of 2 or 3 values but got {len(result)}"
                     )
-                    return {
-                        "runs_successfully": 0.0,
-                        "combined_score": 0.0,
-                        "error": "Invalid result format"
+
+                    error_artifacts = {
+                        "error_type": "InvalidReturnFormat",
+                        "error_message": f"Stage 1: Function returned tuple with {len(result)} values, expected 2 or 3",
+                        "suggestion": "run_search() must return (x, y) or (x, y, value) - check your return statement"
                     }
+
+                    return EvaluationResult(
+                        metrics={
+                            "runs_successfully": 0.0,
+                            "combined_score": 0.0,
+                            "error": "Invalid result format"
+                        },
+                        artifacts=error_artifacts
+                    )
             else:
                 print(f"Stage 1: Invalid result format, expected tuple but got {type(result)}")
-                return {
-                    "runs_successfully": 0.0,
-                    "combined_score": 0.0,
-                    "error": "Invalid result format"
+
+                error_artifacts = {
+                    "error_type": "InvalidReturnType",
+                    "error_message": f"Stage 1: Function returned {type(result)}, expected tuple",
+                    "suggestion": "run_search() must return a tuple like (x, y) or (x, y, value), not a single value or other type"
                 }
+
+                return EvaluationResult(
+                    metrics={
+                        "runs_successfully": 0.0,
+                        "combined_score": 0.0,
+                        "error": "Invalid result format"
+                    },
+                    artifacts=error_artifacts
+                )

             # Ensure all values are float
             x = safe_float(x)
@@ -282,11 +355,21 @@ def evaluate_stage1(program_path):
                 or np.isinf(value)
             ):
                 print(f"Stage 1 validation: Invalid result, got x={x}, y={y}, value={value}")
-                return {
-                    "runs_successfully": 0.5,
-                    "combined_score": 0.0,
-                    "error": "Invalid result values"
+
+                error_artifacts = {
+                    "error_type": "InvalidResultValues",
+                    "error_message": f"Stage 1: Got invalid values - x={x}, y={y}, value={value}",
+                    "suggestion": "Function returned NaN or infinite values. Check for division by zero, invalid math operations, or uninitialized variables"
                 }
+
+                return EvaluationResult(
+                    metrics={
+                        "runs_successfully": 0.5,
+                        "combined_score": 0.0,
+                        "error": "Invalid result values"
+                    },
+                    artifacts=error_artifacts
+                )

             # Calculate distance safely
             x_diff = float(x) - GLOBAL_MIN_X
@@ -311,45 +394,97 @@ def evaluate_stage1(program_path):
             base_score = 0.6 * value_score + 0.4 * distance_score
             combined_score = float(base_score * solution_quality_multiplier)

-            return {
-                "runs_successfully": 1.0,
-                "value_score": value_score,
-                "distance_score": distance_score,
-                "combined_score": combined_score,
+            # Add artifacts for successful stage 1
+            stage1_artifacts = {
+                "stage1_result": f"Found solution at x={x:.4f}, y={y:.4f} with value={value:.4f}",
+                "distance_to_global": f"{distance:.4f}",
+                "solution_quality": f"Distance < 0.5: Very close" if distance < 0.5 else f"Distance < 1.5: Good region" if distance < 1.5 else "Could be improved"
             }
+
+            return EvaluationResult(
+                metrics={
+                    "runs_successfully": 1.0,
+                    "value_score": value_score,
+                    "distance_score": distance_score,
+                    "combined_score": combined_score,
+                },
+                artifacts=stage1_artifacts
+            )
         except TimeoutError as e:
             print(f"Stage 1 evaluation timed out: {e}")
-            return {
-                "runs_successfully": 0.0,
-                "combined_score": 0.0,
-                "error": "Timeout"
+
+            error_artifacts = {
+                "error_type": "TimeoutError",
+                "error_message": "Stage 1: Function execution exceeded 5 second timeout",
+                "suggestion": "Function is likely stuck in infinite loop or doing too much computation. Try reducing iterations or adding early termination conditions"
             }
+
+            return EvaluationResult(
+                metrics={
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": "Timeout"
+                },
+                artifacts=error_artifacts
+            )
         except IndexError as e:
             # Specifically handle IndexError which often happens with early termination checks
             print(f"Stage 1 evaluation failed with IndexError: {e}")
             print("This is likely due to a list index check before the list is fully populated.")
-            return {
-                "runs_successfully": 0.0,
-                "combined_score": 0.0,
-                "error": f"IndexError: {str(e)}"
+
+            error_artifacts = {
+                "error_type": "IndexError",
+                "error_message": f"Stage 1: {str(e)}",
+                "suggestion": "List index out of range - likely accessing empty list or wrong index. Check list initialization and bounds"
             }
+
+            return EvaluationResult(
+                metrics={
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": f"IndexError: {str(e)}"
+                },
+                artifacts=error_artifacts
+            )
         except Exception as e:
             print(f"Stage 1 evaluation failed: {e}")
             print(traceback.format_exc())
-            return {
-                "runs_successfully": 0.0,
-                "combined_score": 0.0,
-                "error": str(e)
+
+            error_artifacts = {
+                "error_type": type(e).__name__,
+                "error_message": f"Stage 1: {str(e)}",
+                "full_traceback": traceback.format_exc(),
+                "suggestion": "Unexpected error occurred. Check the traceback for specific issue"
            }
+
+            return EvaluationResult(
+                metrics={
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": str(e)
+                },
+                artifacts=error_artifacts
+            )

     except Exception as e:
         print(f"Stage 1 evaluation failed: {e}")
         print(traceback.format_exc())
-        return {
-            "runs_successfully": 0.0,
-            "combined_score": 0.0,
-            "error": str(e)
+
+        error_artifacts = {
+            "error_type": type(e).__name__,
+            "error_message": f"Stage 1 outer exception: {str(e)}",
+            "full_traceback": traceback.format_exc(),
+            "suggestion": "Critical error during stage 1 evaluation. Check program syntax and imports"
        }
+
+        return EvaluationResult(
+            metrics={
+                "runs_successfully": 0.0,
+                "combined_score": 0.0,
+                "error": str(e)
+            },
+            artifacts=error_artifacts
+        )


 def evaluate_stage2(program_path):