@@ -1099,7 +1099,148 @@ def aggregate(
10991099 )
11001100
11011101
class CustomCodeExecutionMetricHandler(MetricHandler):
    """Metric handler for custom code execution metrics.

    Evaluates each candidate response by sending it to the evaluation
    service configured with ``Metric.remote_custom_function``, retrying
    on resource-exhausted (HTTP 429) errors with exponential backoff.
    """

    def __init__(self, module: "evals.Evals", metric: types.Metric):
        """Initializes the handler.

        Args:
            module: The evals module used to issue evaluate-instances requests.
            metric: The metric definition. Must have
                ``remote_custom_function`` set.

        Raises:
            ValueError: If ``metric.remote_custom_function`` is not set.
        """
        super().__init__(module=module, metric=metric)

        if not self.metric.remote_custom_function:
            # Fixed message: the original concatenated "... needs " + " Metric..."
            # which produced a double space in the user-facing error.
            raise ValueError(
                f"CustomCodeExecutionMetricHandler for '{self.metric.name}' needs"
                " Metric.remote_custom_function to be set."
            )

    def _build_request_payload(
        self, eval_case: types.EvalCase, response_index: int
    ) -> dict[str, Any]:
        """Builds the request parameters for an evaluate-instances request.

        Args:
            eval_case: The evaluation case holding the prompt, candidate
                responses, and an optional reference.
            response_index: Index of the candidate response to evaluate.

        Returns:
            A dict with a single ``"instance"`` key mapping to the
            ``types.EvaluationInstance`` payload.

        Raises:
            IndexError: If ``response_index`` is out of bounds.
            ValueError: If the selected response has no content.
        """
        if not eval_case.responses or response_index >= len(eval_case.responses):
            raise IndexError(f"response_index {response_index} is out of bounds.")

        response_content = eval_case.responses[response_index].response
        if not response_content:
            raise ValueError(
                f"Response content missing for candidate {response_index}."
            )

        # The reference is optional; only convert it when present.
        reference_instance_data = None
        if eval_case.reference:
            reference_instance_data = (
                PredefinedMetricHandler._content_to_instance_data(
                    eval_case.reference.response
                )
            )

        prompt_instance_data = PredefinedMetricHandler._content_to_instance_data(
            eval_case.prompt
        )

        instance_payload = types.EvaluationInstance(
            prompt=prompt_instance_data,
            response=PredefinedMetricHandler._content_to_instance_data(
                response_content
            ),
            reference=reference_instance_data,
        )

        return {
            "instance": instance_payload,
        }

    @override
    def get_metric_result(
        self, eval_case: types.EvalCase, response_index: int
    ) -> types.EvalCaseMetricResult:
        """Processes a single evaluation case for a custom code execution metric.

        Args:
            eval_case: The evaluation case to score.
            response_index: Index of the candidate response to score.

        Returns:
            A ``types.EvalCaseMetricResult`` carrying either the score and
            explanation from the service, or an ``error_message`` describing
            why no score could be produced. Unexpected exceptions are caught
            and folded into the result rather than propagated.
        """
        metric_name = self.metric.name
        try:
            payload = self._build_request_payload(eval_case, response_index)
            for attempt in range(_MAX_RETRIES):
                try:
                    api_response = self.module._evaluate_instances(
                        metrics=[self.metric],
                        instance=payload.get("instance"),
                    )
                    break
                except genai_errors.ClientError as e:
                    if e.code == 429:
                        # Exponential backoff: 1s, 2s, 4s, ...
                        logger.warning(
                            "Resource Exhausted error on attempt %d/%d: %s. Retrying in %s"
                            " seconds...",
                            attempt + 1,
                            _MAX_RETRIES,
                            e,
                            2**attempt,
                        )
                        if attempt == _MAX_RETRIES - 1:
                            return types.EvalCaseMetricResult(
                                metric_name=metric_name,
                                error_message=f"Resource exhausted after {_MAX_RETRIES} retries: {e}",
                            )
                        time.sleep(2**attempt)
                    else:
                        # Non-retryable client error; handled by the outer
                        # except block below.
                        raise e

            if (
                api_response
                and hasattr(api_response, "metric_results")
                and api_response.metric_results
            ):
                result_data = api_response.metric_results[0]

                error_message = None
                # Bug fix: the original two-argument getattr raised
                # AttributeError when the error object had no "code" field;
                # default to None so a code-less error is treated as absent.
                if result_data.error and getattr(result_data.error, "code", None):
                    error_message = f"Error in metric result: {result_data.error}"
                return types.EvalCaseMetricResult(
                    metric_name=metric_name,
                    score=result_data.score,
                    explanation=result_data.explanation,
                    error_message=error_message,
                )
            else:
                logger.error(
                    "Metric results missing in API response for metric '%s'."
                    " API response: %s",
                    metric_name,
                    (
                        api_response.model_dump_json(exclude_none=True)
                        if api_response
                        else "None"
                    ),
                )
                return types.EvalCaseMetricResult(
                    metric_name=metric_name,
                    error_message="Metric results missing in API response.",
                )
        except Exception as e:  # pylint: disable=broad-exception-caught
            logger.error(
                "Error processing metric %s for case %s: %s",
                metric_name,
                eval_case.eval_case_id,
                e,
                exc_info=True,
            )
            return types.EvalCaseMetricResult(
                metric_name=metric_name, error_message=str(e)
            )

    @override
    def aggregate(
        self, eval_case_metric_results: list[types.EvalCaseMetricResult]
    ) -> types.AggregatedMetricResult:
        """Aggregates per-case results for a custom code execution metric.

        Args:
            eval_case_metric_results: Per-case results produced by
                ``get_metric_result``.

        Returns:
            The aggregated result, including pass rate.
        """
        logger.debug(
            "Aggregating results for custom code execution metric: %s", self.metric.name
        )
        return _default_aggregate_scores(
            self.metric.name, eval_case_metric_results, calculate_pass_rate=True
        )
1237+
1238+
11021239_METRIC_HANDLER_MAPPING = [
1240+ (
1241+ lambda m : hasattr (m , "remote_custom_function" ) and m .remote_custom_function ,
1242+ CustomCodeExecutionMetricHandler ,
1243+ ),
11031244 (
11041245 lambda m : m .custom_function and isinstance (m .custom_function , Callable ),
11051246 CustomMetricHandler ,
@@ -1125,6 +1266,7 @@ def aggregate(
11251266 TranslationMetricHandler ,
11261267 LLMMetricHandler ,
11271268 CustomMetricHandler ,
1269+ CustomCodeExecutionMetricHandler ,
11281270 PredefinedMetricHandler ,
11291271)
11301272
0 commit comments