@@ -69,7 +69,7 @@ def evaluate(program_path):
         return {
             "value_score": 0.0,
             "distance_score": 0.0,
-            "speed_score": 0.0,
+            "reliability_score": 0.0,
             "combined_score": 0.0,
             "error": "Missing run_search function",
         }
@@ -162,7 +162,7 @@ def evaluate(program_path):
         return {
             "value_score": 0.0,
             "distance_score": 0.0,
-            "speed_score": 0.0,
+            "reliability_score": 0.0,
             "combined_score": 0.0,
             "error": "All trials failed",
         }
@@ -173,65 +173,40 @@ def evaluate(program_path):
         avg_time = float(np.mean(times)) if times else 1.0

         # Convert to scores (higher is better)
-        value_score = float(1.0 / (1.0 + abs(avg_value - GLOBAL_MIN_VALUE)))  # Normalize and invert
+        value_score = float(1.0 / (1.0 + abs(avg_value - GLOBAL_MIN_VALUE)))
         distance_score = float(1.0 / (1.0 + avg_distance))
-        speed_score = float(1.0 / avg_time) if avg_time > 0 else 0.0
-
-        # calculate standard deviation scores
-        # get x_std_score
-        x_std_score = float(1.0 / (1.0 + np.std(x_values)))
-        # get y_std_score
-        y_std_score = float(1.0 / (1.0 + np.std(y_values)))
-        standard_deviation_score = (x_std_score + y_std_score) / 2.0
-
-        # Normalize speed score (so it doesn't dominate)
-        speed_score = float(min(speed_score, 10.0) / 10.0)
-
+
         # Add reliability score based on success rate
         reliability_score = float(success_count / num_trials)

-        # Calculate a single combined score that prioritizes finding good solutions
-        # over secondary metrics like speed and reliability
-        # Value and distance scores (quality of solution) get 90% of the weight
-        # Speed and reliability get only 10% combined
-        combined_score = float(
-            0.35 * value_score
-            + 0.35 * distance_score
-            + standard_deviation_score * 0.20
-            + 0.05 * speed_score
-            + 0.05 * reliability_score
-        )
-
-        # Also compute an "overall" score that will be the primary metric for selection
-        # This adds a bonus for finding solutions close to the global minimum
-        # and heavily penalizes solutions that aren't finding the right region
-        if distance_to_global < 1.0:  # Very close to the correct solution
-            solution_quality = 1.0
-        elif distance_to_global < 3.0:  # In the right region
-            solution_quality = 0.5
+        # Calculate solution quality based on distance to global minimum
+        if avg_distance < 0.5:  # Very close to the correct solution
+            solution_quality_multiplier = 1.5  # 50% bonus
+        elif avg_distance < 1.5:  # In the right region
+            solution_quality_multiplier = 1.2  # 20% bonus
+        elif avg_distance < 3.0:  # Getting closer
+            solution_quality_multiplier = 1.0  # No adjustment
         else:  # Not finding the right region
-            solution_quality = 0.1
+            solution_quality_multiplier = 0.7  # 30% penalty

-        # Overall score is dominated by solution quality but also factors in the combined score
-        overall_score = 0.8 * solution_quality + 0.2 * combined_score
+        # Calculate combined score that prioritizes finding the global minimum
+        # Base score from value and distance, then apply solution quality multiplier
+        base_score = 0.5 * value_score + 0.3 * distance_score + 0.2 * reliability_score
+        combined_score = float(base_score * solution_quality_multiplier)

         return {
             "value_score": value_score,
             "distance_score": distance_score,
-            "standard_deviation_score": standard_deviation_score,
-            "speed_score": speed_score,
             "reliability_score": reliability_score,
             "combined_score": combined_score,
-            "overall_score": overall_score,  # This will be the primary selection metric
-            "success_rate": reliability_score,
         }
     except Exception as e:
         print(f"Evaluation failed completely: {str(e)}")
         print(traceback.format_exc())
         return {
             "value_score": 0.0,
             "distance_score": 0.0,
-            "speed_score": 0.0,
+            "reliability_score": 0.0,
             "combined_score": 0.0,
             "error": str(e),
         }
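Note on the reworked scoring above (illustrative sketch, not part of the commit): combined_score is now a weighted base of value, distance, and reliability, scaled by the distance-tier multiplier. With assumed trial statistics of value_score = 0.9, distance_score = 0.8, reliability_score = 1.0, and avg_distance = 0.3 (the < 0.5 tier):

    base_score = 0.5 * 0.9 + 0.3 * 0.8 + 0.2 * 1.0  # = 0.89
    combined_score = base_score * 1.5               # = 1.335 after the 50% bonus

Because the top tier multiplies by 1.5, combined_score can exceed 1.0 for runs landing within 0.5 of the global minimum, so downstream selection should not assume the metric is capped at 1.0.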
@@ -255,7 +230,11 @@ def evaluate_stage1(program_path):
         # Check if the required function exists
         if not hasattr(program, "run_search"):
             print(f"Stage 1 validation: Program does not have 'run_search' function")
-            return {"runs_successfully": 0.0, "error": "Missing run_search function"}
+            return {
+                "runs_successfully": 0.0,
+                "combined_score": 0.0,
+                "error": "Missing run_search function"
+            }

         try:
             # Run a single trial with timeout
@@ -275,10 +254,18 @@ def evaluate_stage1(program_path):
                 print(
                     f"Stage 1: Invalid result format, expected tuple of 2 or 3 values but got {len(result)}"
                 )
-                return {"runs_successfully": 0.0, "error": "Invalid result format"}
+                return {
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": "Invalid result format"
+                }
             else:
                 print(f"Stage 1: Invalid result format, expected tuple but got {type(result)}")
-                return {"runs_successfully": 0.0, "error": "Invalid result format"}
+                return {
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": "Invalid result format"
+                }

             # Ensure all values are float
             x = safe_float(x)
@@ -295,7 +282,11 @@ def evaluate_stage1(program_path):
                 or np.isinf(value)
             ):
                 print(f"Stage 1 validation: Invalid result, got x={x}, y={y}, value={value}")
-                return {"runs_successfully": 0.5, "error": "Invalid result values"}
+                return {
+                    "runs_successfully": 0.5,
+                    "combined_score": 0.0,
+                    "error": "Invalid result values"
+                }

             # Calculate distance safely
             x_diff = float(x) - GLOBAL_MIN_X
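Every failure path in evaluate_stage1 now returns the same dict shape (a sketch of the pattern the hunks above and below apply; the "<reason>" string is a placeholder):

    # Explicit combined_score of 0.0 on every error return, presumably so the
    # selection loop can always read the metric without a missing-key default
    failure = {"runs_successfully": 0.0, "combined_score": 0.0, "error": "<reason>"}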
@@ -306,38 +297,59 @@ def evaluate_stage1(program_path):
             value_score = float(1.0 / (1.0 + abs(value - GLOBAL_MIN_VALUE)))
             distance_score = float(1.0 / (1.0 + distance))

-            # Calculate solution quality metric
-            if distance < 1.0:  # Very close to the correct solution
-                solution_quality = 1.0
-            elif distance < 3.0:  # In the right region
-                solution_quality = 0.5
+            # Calculate solution quality based on distance to global minimum
+            if distance < 0.5:  # Very close to the correct solution
+                solution_quality_multiplier = 1.4  # 40% bonus
+            elif distance < 1.5:  # In the right region
+                solution_quality_multiplier = 1.15  # 15% bonus
+            elif distance < 3.0:  # Getting closer
+                solution_quality_multiplier = 1.0  # No adjustment
             else:  # Not finding the right region
-                solution_quality = 0.1
+                solution_quality_multiplier = 0.8  # 20% penalty
+
+            # Calculate combined score for stage 1
+            base_score = 0.6 * value_score + 0.4 * distance_score
+            combined_score = float(base_score * solution_quality_multiplier)

-            # Basic metrics with overall score
             return {
                 "runs_successfully": 1.0,
                 "value_score": value_score,
                 "distance_score": distance_score,
-                "overall_score": solution_quality,  # This becomes a strong guiding metric
+                "combined_score": combined_score,
             }
         except TimeoutError as e:
             print(f"Stage 1 evaluation timed out: {e}")
-            return {"runs_successfully": 0.0, "error": "Timeout"}
+            return {
+                "runs_successfully": 0.0,
+                "combined_score": 0.0,
+                "error": "Timeout"
+            }
         except IndexError as e:
             # Specifically handle IndexError which often happens with early termination checks
             print(f"Stage 1 evaluation failed with IndexError: {e}")
             print("This is likely due to a list index check before the list is fully populated.")
-            return {"runs_successfully": 0.0, "error": f"IndexError: {str(e)}"}
+            return {
+                "runs_successfully": 0.0,
+                "combined_score": 0.0,
+                "error": f"IndexError: {str(e)}"
+            }
         except Exception as e:
             print(f"Stage 1 evaluation failed: {e}")
             print(traceback.format_exc())
-            return {"runs_successfully": 0.0, "error": str(e)}
+            return {
+                "runs_successfully": 0.0,
+                "combined_score": 0.0,
+                "error": str(e)
+            }

     except Exception as e:
         print(f"Stage 1 evaluation failed: {e}")
         print(traceback.format_exc())
-        return {"runs_successfully": 0.0, "error": str(e)}
+        return {
+            "runs_successfully": 0.0,
+            "combined_score": 0.0,
+            "error": str(e)
+        }


 def evaluate_stage2(program_path):
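Stage-1 scoring mirrors the full evaluation but weights only value and distance (illustrative sketch with assumed numbers, not part of the commit). For a result with value_score = 0.5 at distance 2.0, which falls in the 1.5 <= distance < 3.0 "getting closer" tier:

    distance_score = 1.0 / (1.0 + 2.0)             # ~0.333
    base_score = 0.6 * 0.5 + 0.4 * distance_score  # ~0.433
    combined_score = base_score * 1.0              # no bonus or penalty applied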