11"""
22Evaluator for the function minimization example
33"""
4+
45import importlib .util
56import numpy as np
67import time
910import traceback
1011import sys
1112
13+
1214def run_with_timeout (func , args = (), kwargs = {}, timeout_seconds = 5 ):
1315 """
1416 Run a function with a timeout using concurrent.futures
15-
17+
1618 Args:
1719 func: Function to run
1820 args: Arguments to pass to the function
1921 kwargs: Keyword arguments to pass to the function
2022 timeout_seconds: Timeout in seconds
21-
23+
2224 Returns:
2325 Result of the function or raises TimeoutError
2426 """
@@ -27,7 +29,10 @@ def run_with_timeout(func, args=(), kwargs={}, timeout_seconds=5):
2729 try :
2830 return future .result (timeout = timeout_seconds )
2931 except concurrent .futures .TimeoutError :
30- raise TimeoutError (f"Function { func .__name__ } timed out after { timeout_seconds } seconds" )
32+ raise TimeoutError (
33+ f"Function { func .__name__ } timed out after { timeout_seconds } seconds"
34+ )
35+
3136
3237def safe_float (value ):
3338 """Convert a value to float safely"""
@@ -37,28 +42,29 @@ def safe_float(value):
3742 print (f"Warning: Could not convert { value } of type { type (value )} to float" )
3843 return 0.0
3944
45+
4046def evaluate (program_path ):
4147 """
4248 Evaluate the program by running it multiple times and checking how close
4349 it gets to the known global minimum.
44-
50+
4551 Args:
4652 program_path: Path to the program file
47-
53+
4854 Returns:
4955 Dictionary of metrics
5056 """
5157 # Known global minimum (approximate)
5258 GLOBAL_MIN_X = - 1.76
5359 GLOBAL_MIN_Y = - 1.03
5460 GLOBAL_MIN_VALUE = - 2.104
55-
61+
5662 try :
5763 # Load the program
5864 spec = importlib .util .spec_from_file_location ("program" , program_path )
5965 program = importlib .util .module_from_spec (spec )
6066 spec .loader .exec_module (program )
61-
67+
6268 # Check if the required function exists
6369 if not hasattr (program , "run_search" ):
6470 print (f"Error: program does not have 'run_search' function" )
@@ -67,99 +73,111 @@ def evaluate(program_path):
6773 "distance_score" : 0.0 ,
6874 "speed_score" : 0.0 ,
6975 "combined_score" : 0.0 ,
70- "error" : "Missing run_search function"
76+ "error" : "Missing run_search function" ,
7177 }
72-
78+
7379 # Run multiple trials
7480 num_trials = 10
7581 values = []
7682 distances = []
7783 times = []
7884 success_count = 0
79-
85+
8086 for trial in range (num_trials ):
8187 try :
8288 start_time = time .time ()
83-
89+
8490 # Run with timeout
8591 result = run_with_timeout (program .run_search , timeout_seconds = 5 )
86-
92+
8793 # Check if we got a tuple of 3 values
8894 if not isinstance (result , tuple ) or len (result ) != 3 :
89- print (f"Trial { trial } : Invalid result format, expected tuple of 3 values but got { type (result )} " )
95+ print (
96+ f"Trial { trial } : Invalid result format, expected tuple of 3 values but got { type (result )} "
97+ )
9098 continue
91-
99+
92100 x , y , value = result
93-
101+
94102 end_time = time .time ()
95-
103+
96104 # Ensure all values are float
97105 x = safe_float (x )
98106 y = safe_float (y )
99107 value = safe_float (value )
100-
108+
101109 # Check if the result is valid (not NaN or infinite)
102- if (np .isnan (x ) or np .isnan (y ) or np .isnan (value ) or
103- np .isinf (x ) or np .isinf (y ) or np .isinf (value )):
110+ if (
111+ np .isnan (x )
112+ or np .isnan (y )
113+ or np .isnan (value )
114+ or np .isinf (x )
115+ or np .isinf (y )
116+ or np .isinf (value )
117+ ):
104118 print (f"Trial { trial } : Invalid result, got x={ x } , y={ y } , value={ value } " )
105119 continue
106-
120+
107121 # Calculate metrics
108122 x_diff = safe_float (x ) - GLOBAL_MIN_X
109123 y_diff = safe_float (y ) - GLOBAL_MIN_Y
110124 distance_to_global = np .sqrt (x_diff ** 2 + y_diff ** 2 )
111125 value_difference = abs (value - GLOBAL_MIN_VALUE )
112-
126+
113127 values .append (float (value ))
114128 distances .append (float (distance_to_global ))
115129 times .append (float (end_time - start_time ))
116130 success_count += 1
117-
131+
118132 except TimeoutError as e :
119133 print (f"Trial { trial } : { str (e )} " )
120134 continue
121135 except IndexError as e :
122136 # Specifically handle IndexError which often happens with early termination checks
123137 print (f"Trial { trial } : IndexError - { str (e )} " )
124- print ("This is likely due to a list index check before the list is fully populated." )
138+ print (
139+ "This is likely due to a list index check before the list is fully populated."
140+ )
125141 continue
126142 except Exception as e :
127143 print (f"Trial { trial } : Error - { str (e )} " )
128144 print (traceback .format_exc ())
129145 continue
130-
146+
131147 # If all trials failed, return zero scores
132148 if success_count == 0 :
133149 return {
134150 "value_score" : 0.0 ,
135151 "distance_score" : 0.0 ,
136152 "speed_score" : 0.0 ,
137153 "combined_score" : 0.0 ,
138- "error" : "All trials failed"
154+ "error" : "All trials failed" ,
139155 }
140-
156+
141157 # Calculate metrics
142158 avg_value = float (np .mean (values ))
143159 avg_distance = float (np .mean (distances ))
144160 avg_time = float (np .mean (times )) if times else 1.0
145-
161+
146162 # Convert to scores (higher is better)
147163 value_score = float (1.0 / (1.0 + abs (avg_value - GLOBAL_MIN_VALUE ))) # Normalize and invert
148164 distance_score = float (1.0 / (1.0 + avg_distance ))
149165 speed_score = float (1.0 / avg_time ) if avg_time > 0 else 0.0
150-
166+
151167 # Normalize speed score (so it doesn't dominate)
152168 speed_score = float (min (speed_score , 10.0 ) / 10.0 )
153-
169+
154170 # Add reliability score based on success rate
155171 reliability_score = float (success_count / num_trials )
156-
172+
157173 # Calculate a single combined score that prioritizes finding good solutions
158174 # over secondary metrics like speed and reliability
159175 # Value and distance scores (quality of solution) get 90% of the weight
160176 # Speed and reliability get only 10% combined
161- combined_score = float (0.6 * value_score + 0.3 * distance_score + 0.05 * speed_score + 0.05 * reliability_score )
162-
177+ combined_score = float (
178+ 0.6 * value_score + 0.3 * distance_score + 0.05 * speed_score + 0.05 * reliability_score
179+ )
180+
163181 # Also compute an "overall" score that will be the primary metric for selection
164182 # This adds a bonus for finding solutions close to the global minimum
165183 # and heavily penalizes solutions that aren't finding the right region
@@ -169,18 +187,18 @@ def evaluate(program_path):
169187 solution_quality = 0.5
170188 else : # Not finding the right region
171189 solution_quality = 0.1
172-
190+
173191 # Overall score is dominated by solution quality but also factors in the combined score
174192 overall_score = 0.8 * solution_quality + 0.2 * combined_score
175-
193+
176194 return {
177195 "value_score" : value_score ,
178196 "distance_score" : distance_score ,
179197 "speed_score" : speed_score ,
180198 "reliability_score" : reliability_score ,
181199 "combined_score" : combined_score ,
182200 "overall_score" : overall_score , # This will be the primary selection metric
183- "success_rate" : reliability_score
201+ "success_rate" : reliability_score ,
184202 }
185203 except Exception as e :
186204 print (f"Evaluation failed completely: { str (e )} " )
@@ -190,75 +208,85 @@ def evaluate(program_path):
190208 "distance_score" : 0.0 ,
191209 "speed_score" : 0.0 ,
192210 "combined_score" : 0.0 ,
193- "error" : str (e )
211+ "error" : str (e ),
194212 }
195213
214+
196215# Stage-based evaluation for cascade evaluation
197216def evaluate_stage1 (program_path ):
198217 """First stage evaluation with fewer trials"""
199218 # Known global minimum (approximate)
200219 GLOBAL_MIN_X = float (- 1.76 )
201220 GLOBAL_MIN_Y = float (- 1.03 )
202221 GLOBAL_MIN_VALUE = float (- 2.104 )
203-
222+
204223 # Quick check to see if the program runs without errors
205224 try :
206225 # Load the program
207226 spec = importlib .util .spec_from_file_location ("program" , program_path )
208227 program = importlib .util .module_from_spec (spec )
209228 spec .loader .exec_module (program )
210-
229+
211230 # Check if the required function exists
212231 if not hasattr (program , "run_search" ):
213232 print (f"Stage 1 validation: Program does not have 'run_search' function" )
214233 return {"runs_successfully" : 0.0 , "error" : "Missing run_search function" }
215-
234+
216235 try :
217236 # Run a single trial with timeout
218237 result = run_with_timeout (program .run_search , timeout_seconds = 5 )
219-
238+
220239 # Check if we got a tuple of 3 values
221240 if not isinstance (result , tuple ) or len (result ) != 3 :
222- print (f"Stage 1: Invalid result format, expected tuple of 3 values but got { type (result )} " )
241+ print (
242+ f"Stage 1: Invalid result format, expected tuple of 3 values but got { type (result )} "
243+ )
223244 return {"runs_successfully" : 0.0 , "error" : "Invalid result format" }
224-
245+
225246 x , y , value = result
226-
247+
227248 # Ensure all values are float
228249 x = safe_float (x )
229250 y = safe_float (y )
230251 value = safe_float (value )
231-
252+
232253 # Check if the result is valid
233- if np .isnan (x ) or np .isnan (y ) or np .isnan (value ) or np .isinf (x ) or np .isinf (y ) or np .isinf (value ):
254+ if (
255+ np .isnan (x )
256+ or np .isnan (y )
257+ or np .isnan (value )
258+ or np .isinf (x )
259+ or np .isinf (y )
260+ or np .isinf (value )
261+ ):
234262 print (f"Stage 1 validation: Invalid result, got x={ x } , y={ y } , value={ value } " )
235263 return {"runs_successfully" : 0.5 , "error" : "Invalid result values" }
236-
264+
237265 # Calculate distance safely
238266 x_diff = float (x ) - GLOBAL_MIN_X
239267 y_diff = float (y ) - GLOBAL_MIN_Y
240268 distance = float (np .sqrt (x_diff ** 2 + y_diff ** 2 ))
241-
269+
242270 # Calculate value-based score
243271 value_score = float (1.0 / (1.0 + abs (value - GLOBAL_MIN_VALUE )))
244272 distance_score = float (1.0 / (1.0 + distance ))
245-
273+
246274 # Calculate solution quality metric
247275 if distance < 1.0 : # Very close to the correct solution
248276 solution_quality = 1.0
249277 elif distance < 3.0 : # In the right region
250278 solution_quality = 0.5
251279 else : # Not finding the right region
252280 solution_quality = 0.1
253-
281+
254282 # Basic metrics with overall score
255283 return {
256284 "runs_successfully" : 1.0 ,
257285 "value" : float (value ),
258286 "distance" : distance ,
259287 "value_score" : value_score ,
260288 "distance_score" : distance_score ,
261- "overall_score" : solution_quality # This becomes a strong guiding metric
289+ "overall_score" : solution_quality , # This becomes a strong guiding metric
262290 }
263291 except TimeoutError as e :
264292 print (f"Stage 1 evaluation timed out: { e } " )
@@ -272,12 +300,13 @@ def evaluate_stage1(program_path):
272300 print (f"Stage 1 evaluation failed: { e } " )
273301 print (traceback .format_exc ())
274302 return {"runs_successfully" : 0.0 , "error" : str (e )}
275-
303+
276304 except Exception as e :
277305 print (f"Stage 1 evaluation failed: { e } " )
278306 print (traceback .format_exc ())
279307 return {"runs_successfully" : 0.0 , "error" : str (e )}
280308
309+
281310def evaluate_stage2 (program_path ):
282311 """Second stage evaluation with more thorough testing"""
283312 # Full evaluation as in the main evaluate function
0 commit comments