1+ """
2+ Evaluator for R robust regression example
3+ """
4+
5+ import asyncio
6+ import json
7+ import os
8+ import subprocess
9+ import tempfile
10+ import time
11+ from pathlib import Path
12+ from typing import Dict , Any
13+
14+ import numpy as np
15+
16+ from openevolve .evaluation_result import EvaluationResult
17+
18+
19+ async def evaluate (program_path : str ) -> EvaluationResult :
20+ """
21+ Evaluate an R program implementing robust regression.
22+
23+ Tests the program on synthetic data with outliers to measure:
24+ - Accuracy (MSE, MAE, R-squared)
25+ - Robustness to outliers
26+ - Computational efficiency
27+ """
    try:
        # Generate test datasets with different outlier levels
        test_cases = [
            generate_regression_data(n_samples=100, n_features=3, outlier_fraction=0.0, noise=0.1),
            generate_regression_data(n_samples=100, n_features=3, outlier_fraction=0.1, noise=0.1),
            generate_regression_data(n_samples=100, n_features=3, outlier_fraction=0.2, noise=0.1),
            generate_regression_data(n_samples=200, n_features=5, outlier_fraction=0.15, noise=0.2),
        ]

        total_score = 0
        total_mse = 0
        total_mae = 0
        total_medae = 0
        total_r_squared = 0
        total_outlier_robustness = 0
        total_time = 0

        artifacts = {"test_results": []}

        for i, (X, y, true_coeffs) in enumerate(test_cases):
            # Save the test data to temporary CSV files first so the R script
            # below can reference them by path
            X_file = tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False)
            y_file = tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False)
            np.savetxt(X_file.name, X, delimiter=',', fmt='%.6f')
            np.savetxt(y_file.name, y, delimiter=',', fmt='%.6f')
            X_file.close()
            y_file.close()

            # Create a temporary R script that sources the program and runs it
            with tempfile.NamedTemporaryFile(mode='w', suffix='.r', delete=False) as f:
                f.write(f"""
# Source the program
source("{program_path}")

# Load test data
X <- as.matrix(read.csv("{X_file.name}", header=FALSE))
y <- as.vector(as.matrix(read.csv("{y_file.name}", header=FALSE)))

# Time the execution
start_time <- Sys.time()
metrics <- main()
end_time <- Sys.time()
exec_time <- as.numeric(end_time - start_time, units="secs")

# Add execution time
metrics$execution_time <- exec_time

# Save results
write(jsonlite::toJSON(metrics, auto_unbox=TRUE), "results.json")
""")
                test_script = f.name

            # Run the R script
            try:
                result = subprocess.run(
                    ['Rscript', test_script],
                    capture_output=True,
                    text=True,
                    timeout=30,
                    cwd=os.path.dirname(test_script)
                )

                if result.returncode != 0:
                    artifacts["test_results"].append({
                        "test_case": i,
                        "error": "R execution failed",
                        "stderr": result.stderr
                    })
                    continue

                # Read results
                results_path = os.path.join(os.path.dirname(test_script), 'results.json')
                if not os.path.exists(results_path):
                    artifacts["test_results"].append({
                        "test_case": i,
                        "error": "No results file produced"
                    })
                    continue

                with open(results_path, 'r') as f:
                    metrics = json.load(f)

                # Calculate case score (emphasize robustness for cases with outliers)
                outlier_fraction = [0.0, 0.1, 0.2, 0.15][i]
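                # NOTE: this list must stay in sync with the outlier_fraction
                # values used to build test_cases above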
                if outlier_fraction > 0:
                    # For cases with outliers, prioritize robust metrics
                    case_score = (
                        0.2 * (1 - min(metrics.get('mse', 1), 1)) +
                        0.3 * (1 - min(metrics.get('medae', 1), 1)) +
                        0.4 * metrics.get('outlier_robustness', 0) +
                        0.1 * max(0, metrics.get('r_squared', 0))
                    )
                else:
                    # For clean data, prioritize accuracy
                    case_score = (
                        0.4 * (1 - min(metrics.get('mse', 1), 1)) +
                        0.3 * (1 - min(metrics.get('mae', 1), 1)) +
                        0.2 * max(0, metrics.get('r_squared', 0)) +
                        0.1 * metrics.get('outlier_robustness', 0)
                    )

                total_score += case_score
                total_mse += metrics.get('mse', 1)
                total_mae += metrics.get('mae', 1)
                total_medae += metrics.get('medae', 1)
                total_r_squared += max(0, metrics.get('r_squared', 0))
                total_outlier_robustness += metrics.get('outlier_robustness', 0)
                total_time += metrics.get('execution_time', 1)

                artifacts["test_results"].append({
                    "test_case": i,
                    "outlier_fraction": outlier_fraction,
                    "metrics": metrics,
                    "case_score": case_score
                })

            except subprocess.TimeoutExpired:
                artifacts["test_results"].append({
                    "test_case": i,
                    "error": "Timeout"
                })
            except Exception as e:
                artifacts["test_results"].append({
                    "test_case": i,
                    "error": str(e)
                })
            finally:
                # Cleanup
                os.unlink(test_script)
                os.unlink(X_file.name)
                os.unlink(y_file.name)
                results_path = os.path.join(os.path.dirname(test_script), 'results.json')
                if os.path.exists(results_path):
                    os.unlink(results_path)

        # Calculate average metrics
        n_successful = len([r for r in artifacts["test_results"] if "error" not in r])
        if n_successful == 0:
            return EvaluationResult(
                metrics={
                    "score": 0.0,
                    "mse": float('inf'),
                    "mae": float('inf'),
                    "medae": float('inf'),
                    "r_squared": 0.0,
                    "outlier_robustness": 0.0,
                    "execution_time": float('inf')
                },
                artifacts=artifacts
            )

        avg_score = total_score / n_successful
        avg_mse = total_mse / n_successful
        avg_mae = total_mae / n_successful
        avg_medae = total_medae / n_successful
        avg_r_squared = total_r_squared / n_successful
        avg_outlier_robustness = total_outlier_robustness / n_successful
        avg_time = total_time / n_successful

        # Add efficiency bonus for faster execution
        efficiency_bonus = max(0, 1 - avg_time) * 0.1
        final_score = min(1.0, avg_score + efficiency_bonus)

        return EvaluationResult(
            metrics={
                "score": final_score,
                "mse": avg_mse,
                "mae": avg_mae,
                "medae": avg_medae,
                "r_squared": avg_r_squared,
                "outlier_robustness": avg_outlier_robustness,
                "execution_time": avg_time
            },
            artifacts=artifacts
        )

    except Exception as e:
        return EvaluationResult(
            metrics={
                "score": 0.0,
                "mse": float('inf'),
                "mae": float('inf'),
                "medae": float('inf'),
                "r_squared": 0.0,
                "outlier_robustness": 0.0,
                "execution_time": float('inf')
            },
            artifacts={"error": str(e), "type": "evaluation_error"}
        )


def generate_regression_data(n_samples=100, n_features=3, outlier_fraction=0.1, noise=0.1):
    """Generate synthetic regression data with outliers."""
    np.random.seed(42)

    # Generate features
    X = np.random.randn(n_samples, n_features)

    # True coefficients
    true_coeffs = np.random.randn(n_features + 1)  # +1 for intercept

    # Generate target values
    y = true_coeffs[0] + X @ true_coeffs[1:] + noise * np.random.randn(n_samples)

    # Add outliers
    n_outliers = int(n_samples * outlier_fraction)
    if n_outliers > 0:
        outlier_indices = np.random.choice(n_samples, n_outliers, replace=False)
        # Make outliers by adding large errors
        y[outlier_indices] += np.random.choice([-1, 1], n_outliers) * np.random.uniform(3, 10, n_outliers)

    return X, y, true_coeffs

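
# For reference only: a minimal, hypothetical R program satisfying the contract
# exercised by evaluate() above could look like the sketch below. It is not part
# of this evaluator; an evolved program would use a genuinely robust estimator,
# and outlier_robustness is shown as a placeholder.
#
#   main <- function() {
#       fit <- lm(y ~ X)                     # plain least-squares baseline
#       residuals <- y - fitted(fit)
#       list(
#           mse = mean(residuals^2),
#           mae = mean(abs(residuals)),
#           medae = median(abs(residuals)),
#           r_squared = summary(fit)$r.squared,
#           outlier_robustness = 0.0         # placeholder metric
#       )
#   }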


# For testing
if __name__ == "__main__":
    import sys
    if len(sys.argv) > 1:
        result = asyncio.run(evaluate(sys.argv[1]))
        print(f"Score: {result.metrics['score']:.4f}")
        print(f"MSE: {result.metrics['mse']:.4f}")
        print(f"Outlier Robustness: {result.metrics['outlier_robustness']:.4f}")