@@ -214,12 +214,18 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
         score = math.nan
         if llm_output:
             score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[0-1]")
-            return {
-                self._result_key: bool(float(score)),
-                f"{self._result_key}_reason": reason,
-                "tool_call_id": eval_input.get("tool_call").get("tool_call_id"),
-            }
-        return {self._result_key: float(score)}
+            if score >= 0 and score <= 1:
+                return {
+                    self._result_key: bool(float(score)),
+                    f"{self._result_key}_reason": reason,
+                    "tool_call_id": eval_input.get("tool_call").get("tool_call_id"),
+                }
+        raise EvaluationException(
+            message="Tool call accuracy evaluator: Invalid score returned from LLM.",
+            blame=ErrorBlame.SYSTEM_ERROR,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+        )
 
     async def _real_call(self, **kwargs):
         """The asynchronous call where real end-to-end evaluation logic is performed.
@@ -231,13 +237,55 @@ async def _real_call(self, **kwargs):
         """
         # Convert inputs into list of evaluable inputs.
         eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
+        if len(eval_input_list) == 0:
+            return {self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
+                    f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
+                    f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
+                    f"{self._AGGREGATE_RESULT_KEY}_reason":
+                        "No tool calls were made.",
+                    "per_tool_call_details": []
+                    }
+
         per_turn_results = []
         # Evaluate all inputs.
         for eval_input in eval_input_list:
-            per_turn_results.append(await self._do_eval(eval_input))
+            if self._is_applicable_tool(eval_input):
+                per_turn_results.append(await self._do_eval(eval_input))
+            else:
+                per_turn_results.append(self._not_applicable_result(eval_input))
 
         return self._aggregate_results(per_turn_results=per_turn_results)
 
+    def _is_applicable_tool(self, eval_input):
+        """Determine if a given tool should be evaluated, since we only evaluate tools that
+        have sufficient context available.
+
+        :type eval_input: Dict
+        :return: True if the tool call should be evaluated
+        :rtype: bool
+        """
+        tool_definition = eval_input.get("tool_definition")
+        if tool_definition is None or len(tool_definition) != 1:
+            return False
+        tool_type = tool_definition[0].get("type")
+        if tool_type is None or tool_type != "function":
+            return False
+        return True
+
+    def _not_applicable_result(self, eval_input):
+        """Return a result indicating that the tool call is not applicable for evaluation.
+
+        :param eval_input: The input to the evaluator.
+        :type eval_input: Dict
+        :return: A dictionary containing the result of the evaluation.
+        :rtype: Dict[str, Union[str, float]]
+        """
+        return {
+            f"{self._result_key}": self._NOT_APPLICABLE_RESULT,
+            f"{self._result_key}_reason": "Tool call not supported for evaluation",
+            "tool_call_id": eval_input.get("tool_call").get("tool_call_id"),
+        }
+
     def _aggregate_results(self, per_turn_results):
         """Aggregate the evaluation results of each conversation turn into a single result.
 
@@ -260,11 +308,23 @@ def _aggregate_results(self, per_turn_results):
         # Go over each turn, and rotate the results into a
         # metric: List[values] format for the evals_per_turn dictionary.
 
-        score = sum([1 if per_turn_result.get(self._result_key) else 0 for per_turn_result in per_turn_results])/len(per_turn_results)
+        num_evaluated = len([per_turn_result for per_turn_result in per_turn_results
+                             if per_turn_result.get(self._result_key) != self._NOT_APPLICABLE_RESULT])
+        if num_evaluated == 0:
+            # None of the invoked tools were applicable, return not applicable result
+            # (If a tool fails evaluation, we'll throw an exception)
+            return {self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
+                    f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
+                    f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
+                    f"{self._AGGREGATE_RESULT_KEY}_reason":
+                        "Tool call accuracy evaluation is not yet supported for the invoked tools.",
+                    "per_tool_call_details": []
+                    }
+        # ignore not_applicable results, where the _result_key will be "not applicable"
+        score = sum([per_turn_result.get(self._result_key) == True for per_turn_result in per_turn_results])/num_evaluated
         aggregated[self._AGGREGATE_RESULT_KEY] = score
-        aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = 'pass' if score >= self.threshold else 'fail'
+        aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = self._PASS_RESULT if score >= self.threshold else self._FAIL_RESULT
         aggregated[f'{self._AGGREGATE_RESULT_KEY}_threshold'] = self.threshold
-
         aggregated["per_tool_call_details"] = per_turn_results
         return aggregated
 
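For reference, a minimal standalone sketch of the behavior this change introduces: tool calls whose definitions are missing or not of type "function" are treated as not applicable, and the aggregate score is computed only over the evaluated calls. The literal strings "not applicable", "pass", and "fail", the "tool_call_accuracy" key names, and the 0.8 threshold below are assumptions standing in for the class constants (_NOT_APPLICABLE_RESULT, _PASS_RESULT, _FAIL_RESULT, _AGGREGATE_RESULT_KEY, self.threshold), whose definitions are outside this diff.

```python
# Illustrative sketch only -- mirrors the logic added in this commit, with
# class constants replaced by assumed literal values.
NOT_APPLICABLE = "not applicable"  # assumed value of _NOT_APPLICABLE_RESULT
THRESHOLD = 0.8                    # assumed default threshold


def is_applicable_tool(tool_definition):
    # Only a single matching definition of type "function" is evaluable.
    if tool_definition is None or len(tool_definition) != 1:
        return False
    return tool_definition[0].get("type") == "function"


def aggregate(per_tool_call_results, threshold=THRESHOLD):
    # Not-applicable results are excluded from the denominator.
    evaluated = [r for r in per_tool_call_results if r != NOT_APPLICABLE]
    if not evaluated:
        return {
            "tool_call_accuracy": NOT_APPLICABLE,
            "tool_call_accuracy_result": NOT_APPLICABLE,
        }
    score = sum(r is True for r in evaluated) / len(evaluated)
    return {
        "tool_call_accuracy": score,
        "tool_call_accuracy_result": "pass" if score >= threshold else "fail",
    }


# Example: one accurate call, one inaccurate call, and one built-in tool that
# is skipped as not applicable -> score 0.5 over the two evaluated calls.
print(aggregate([True, False, NOT_APPLICABLE]))
```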