@@ -214,12 +214,18 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
         score = math.nan
         if llm_output:
             score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[0-1]")
-            return {
-                self._result_key: bool(float(score)),
-                f"{self._result_key}_reason": reason,
-                "tool_call_id": eval_input.get("tool_call").get("tool_call_id"),
-            }
-        return {self._result_key: float(score)}
+            if score >= 0 and score <= 1:
+                return {
+                    self._result_key: bool(float(score)),
+                    f"{self._result_key}_reason": reason,
+                    "tool_call_id": eval_input.get("tool_call").get("tool_call_id"),
+                }
+        raise EvaluationException(
+            message="Tool call accuracy evaluator: Invalid score returned from LLM.",
+            blame=ErrorBlame.SYSTEM_ERROR,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+        )

     async def _real_call(self, **kwargs):
         """The asynchronous call where real end-to-end evaluation logic is performed.
@@ -231,13 +237,55 @@ async def _real_call(self, **kwargs):
231237 """
232238 # Convert inputs into list of evaluable inputs.
233239 eval_input_list = self ._convert_kwargs_to_eval_input (** kwargs )
240+ if len (eval_input_list ) == 0 :
241+ return {self ._AGGREGATE_RESULT_KEY : self ._NOT_APPLICABLE_RESULT ,
242+ f"{ self ._AGGREGATE_RESULT_KEY } _result" : self ._NOT_APPLICABLE_RESULT ,
243+ f"{ self ._AGGREGATE_RESULT_KEY } _threshold" : self .threshold ,
244+ f"{ self ._AGGREGATE_RESULT_KEY } _reason" :
245+ "No tool calls were made." ,
246+ "per_tool_call_details" : []
247+ }
248+
234249 per_turn_results = []
235250 # Evaluate all inputs.
236251 for eval_input in eval_input_list :
237- per_turn_results .append (await self ._do_eval (eval_input ))
252+ if self ._is_applicable_tool (eval_input ):
253+ per_turn_results .append (await self ._do_eval (eval_input ))
254+ else :
255+ per_turn_results .append (self ._not_applicable_result (eval_input ))
238256
239257 return self ._aggregate_results (per_turn_results = per_turn_results )
240258
259+ def _is_applicable_tool (self , eval_input ):
260+ """Determine if a given tool should be evaluated, since we only evaluate tools that
261+ have sufficient context available.
262+
263+ :type eval_input: Dict
264+ :return: True if the tool call should be evaluated
265+ :rtype: bool
266+ """
267+ tool_definition = eval_input .get ("tool_definition" )
268+ if tool_definition is None or len (tool_definition ) != 1 :
269+ return False
270+ tool_type = tool_definition [0 ].get ("type" )
271+ if tool_type is None or tool_type != "function" :
272+ return False
273+ return True
274+
275+ def _not_applicable_result (self , eval_input ):
276+ """Return a result indicating that the tool call is not applicable for evaluation.
277+
278+ :param eval_input: The input to the evaluator.
279+ :type eval_input: Dict
280+ :return: A dictionary containing the result of the evaluation.
281+ :rtype: Dict[str, Union[str, float]]
282+ """
283+ return {
284+ f"{ self ._result_key } " : self ._NOT_APPLICABLE_RESULT ,
285+ f"{ self ._result_key } _reason" : "Tool call not supported for evaluation" ,
286+ "tool_call_id" : eval_input .get ("tool_call" ).get ("tool_call_id" ),
287+ }
288+
241289 def _aggregate_results (self , per_turn_results ):
242290 """Aggregate the evaluation results of each conversation turn into a single result.
243291
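For intuition, here is a self-contained sketch of the applicability check with made-up inputs (the eval_input shape matches the diff above; the example data is illustrative):

# Standalone sketch of _is_applicable_tool, assuming eval_input carries a
# "tool_definition" list as in the diff; the sample data below is made up.
def is_applicable_tool(eval_input: dict) -> bool:
    tool_definition = eval_input.get("tool_definition")
    if tool_definition is None or len(tool_definition) != 1:
        return False
    # Only a single "function"-type definition carries enough context to judge.
    return tool_definition[0].get("type") == "function"

examples = [
    {"tool_definition": [{"type": "function"}]},          # evaluated
    {"tool_definition": [{"type": "code_interpreter"}]},  # skipped: unsupported type
    {"tool_definition": []},                              # skipped: no definition
]
print([is_applicable_tool(e) for e in examples])  # [True, False, False]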
@@ -260,11 +308,23 @@ def _aggregate_results(self, per_turn_results):
         # Go over each turn, and rotate the results into a
         # metric: List[values] format for the evals_per_turn dictionary.

-        score = sum([1 if per_turn_result.get(self._result_key) else 0 for per_turn_result in per_turn_results]) / len(per_turn_results)
+        num_evaluated = len([per_turn_result for per_turn_result in per_turn_results
+                             if per_turn_result.get(self._result_key) != self._NOT_APPLICABLE_RESULT])
+        if num_evaluated == 0:
+            # None of the invoked tools were applicable, return not applicable result
+            # (If a tool fails evaluation, we'll throw an exception)
+            return {
+                self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
+                f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
+                f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
+                f"{self._AGGREGATE_RESULT_KEY}_reason": "Tool call accuracy evaluation is not yet supported for the invoked tools.",
+                "per_tool_call_details": [],
+            }
+        # Ignore not-applicable results, where the _result_key will be "not applicable"
+        score = sum([per_turn_result.get(self._result_key) == True for per_turn_result in per_turn_results]) / num_evaluated
         aggregated[self._AGGREGATE_RESULT_KEY] = score
-        aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = 'pass' if score >= self.threshold else 'fail'
+        aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = self._PASS_RESULT if score >= self.threshold else self._FAIL_RESULT
         aggregated[f'{self._AGGREGATE_RESULT_KEY}_threshold'] = self.threshold
-
         aggregated["per_tool_call_details"] = per_turn_results
         return aggregated

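To make the new aggregation concrete, a self-contained sketch with made-up per-turn rows (key names follow the diff; the data, threshold, and sentinel value are illustrative):

# Sketch of the aggregation: the pass rate is computed only over rows that were
# actually evaluated; "not applicable" rows are excluded from the denominator.
NOT_APPLICABLE = "not applicable"  # assumed value of _NOT_APPLICABLE_RESULT

def aggregate(per_turn_results: list, threshold: float) -> dict:
    evaluated = [r for r in per_turn_results
                 if r["tool_call_accurate"] != NOT_APPLICABLE]
    if not evaluated:
        return {"tool_call_accuracy": NOT_APPLICABLE, "per_tool_call_details": []}
    # `is True` keeps the "not applicable" string rows out of the numerator too.
    score = sum(r["tool_call_accurate"] is True for r in per_turn_results) / len(evaluated)
    return {
        "tool_call_accuracy": score,
        "tool_call_accuracy_result": "pass" if score >= threshold else "fail",
        "per_tool_call_details": per_turn_results,
    }

rows = [
    {"tool_call_accurate": True},
    {"tool_call_accurate": False},
    {"tool_call_accurate": NOT_APPLICABLE},
]
print(aggregate(rows, threshold=0.8))  # score 0.5 over 2 evaluated rows -> "fail"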