
Commit d532ddf

salma-elshafey and Salma Elshafey authored
Fix flow structure for Task Completion and Tool Call Accuracy evaluators (#43646)
* Fix Tool Call Accuracy after latest base eval updates
* Fix Task Completion after latest base eval changes
* Tool Call Accuracy Fix
* remove gpt_ key
* to re-trigger build pipeline
* pass threshold to eval initialization
* rename details key and add gpt_ prefix to tool call accuracy

Co-authored-by: Salma Elshafey <[email protected]>
1 parent ef340f3 · commit d532ddf

File tree

3 files changed: +46, -23 lines

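For orientation: the "flow structure" this commit fixes is the shape of what self._flow(...) returns. After the latest base-eval updates it returns a wrapper dictionary instead of the parsed LLM output itself. Below is a minimal sketch of that shape, inferred only from the .get() calls in the diffs that follow; the key names come from the diffs, the values are invented for illustration.

# Illustrative sketch only: the wrapper the evaluators below now assume the base
# prompty flow returns. Key names are taken from the .get() calls in the diffs;
# the values here are made up.
prompty_output_dict = {
    "llm_output": {"success": True, "details": ""},  # parsed LLM response, previously returned directly
    "input_token_count": 812,
    "output_token_count": 64,
    "total_token_count": 876,
    "finish_reason": "stop",
    "model_id": "gpt-4o",
    "sample_input": "...",
    "sample_output": "...",
}

# Each evaluator therefore unwraps it before reading scores:
llm_output = prompty_output_dict.get("llm_output", {})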

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py

Lines changed: 10 additions & 1 deletion
@@ -149,7 +149,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[bool, str]]: # ty
         if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None:
             eval_input["tool_definitions"] = reformat_tool_definitions(eval_input["tool_definitions"], logger)
 
-        llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = prompty_output_dict.get("llm_output", {})
+
         if isinstance(llm_output, dict):
             success = llm_output.get("success", False)
             if isinstance(success, str):
@@ -162,6 +164,13 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[bool, str]]: # ty
                 f"{self._result_key}_result": success_result,
                 f"{self._result_key}_reason": reason,
                 f"{self._result_key}_details": llm_output.get("details", ""),
+                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
+                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
+                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
+                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
             }
         if logger:
             logger.warning("LLM output is not a dictionary, returning False for the success.")
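
To make the new keys concrete, here is a hypothetical Task Completion result payload after this change, assuming the evaluator's result key is "task_completion". Only keys visible in the diff are shown, and the values are invented.

# Hypothetical example; not produced by running the evaluator.
task_completion_result_example = {
    "task_completion_result": "pass",
    "task_completion_reason": "The agent completed the user's request.",
    "task_completion_details": "",
    "task_completion_prompt_tokens": 812,
    "task_completion_completion_tokens": 64,
    "task_completion_total_tokens": 876,
    "task_completion_finish_reason": "stop",
    "task_completion_model": "gpt-4o",
    "task_completion_sample_input": "...",
    "task_completion_sample_output": "...",
}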

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py

Lines changed: 17 additions & 7 deletions
@@ -132,6 +132,7 @@ def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE,
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             credential=credential,
+            threshold=threshold,
             **kwargs,
         )
 
@@ -235,8 +236,8 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
         :rtype: Dict
         """
         # Single LLM call for all tool calls
-        llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = prompty_output_dict.get("llm_output", {})
         if isinstance(llm_output, dict):
             score = llm_output.get(self._LLM_SCORE_KEY, None)
             if not score or not check_score_is_valid(
@@ -257,10 +258,18 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
             score_result = "pass" if score >= self.threshold else "fail"
             response_dict = {
                 self._result_key: score,
+                f"gpt_{self._result_key}": score,
                 f"{self._result_key}_result": score_result,
-                f"{self._result_key}_threshold": self.threshold,
+                f"{self._result_key}_threshold": self._threshold,
                 f"{self._result_key}_reason": reason,
-                "details": llm_output.get("details", {}),
+                f"{self._result_key}_details": llm_output.get("details", {}),
+                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
+                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
+                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
+                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
             }
             return response_dict
 
@@ -275,9 +284,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
     async def _real_call(self, **kwargs):
         """The asynchronous call where real end-to-end evaluation logic is performed.
 
-        :keyword kwargs: The inputs to evaluate.
+        :keyword kwargs: The inputs to evaluate
         :type kwargs: Dict
-        :return: The evaluation result.
+        :return: The evaluation result
         :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
         """
         # Convert inputs into list of evaluable inputs.
@@ -300,10 +309,11 @@ def _not_applicable_result(self, error_message):
         # If no tool calls were made or tool call type is not supported, return not applicable result
         return {
             self._result_key: self._NOT_APPLICABLE_RESULT,
+            f"gpt_{self._result_key}": self._NOT_APPLICABLE_RESULT,
             f"{self._result_key}_result": "pass",
             f"{self._result_key}_threshold": self.threshold,
             f"{self._result_key}_reason": error_message,
-            "details": {},
+            f"{self._result_key}_details": {},
         }
 
     def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
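
Similarly, a hypothetical Tool Call Accuracy payload after this change, assuming the result key is "tool_call_accuracy". Note the new gpt_-prefixed alias and the details key now carrying the result-key prefix; values are invented.

# Hypothetical example; not produced by running the evaluator.
tool_call_accuracy_result_example = {
    "tool_call_accuracy": 4.0,
    "gpt_tool_call_accuracy": 4.0,  # new legacy-style alias added by this commit
    "tool_call_accuracy_result": "pass",
    "tool_call_accuracy_threshold": 3,
    "tool_call_accuracy_reason": "Evaluated 2 tool calls with 2 correct calls.",
    "tool_call_accuracy_details": {  # renamed from the bare "details" key
        "tool_calls_made_by_agent": 2,
        "correct_tool_calls_made_by_agent": 2,
    },
    # ...plus the same *_prompt_tokens, *_completion_tokens, *_total_tokens,
    # *_finish_reason, *_model, *_sample_input, and *_sample_output keys
    # added to Task Completion above.
}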

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py

Lines changed: 19 additions & 15 deletions
@@ -47,9 +47,11 @@ async def flow_side_effect(timeout, **kwargs):
     if invalid_calls > 0:
         # Return a non-numeric score to trigger an exception in the evaluator's check_score_is_valid
         return {
-            "chain_of_thought": "The tool calls were very correct that I returned a huge number!",
-            "tool_calls_success_level": 25,
-            "additional_details": {},
+            "llm_output": {
+                "chain_of_thought": "The tool calls were very correct that I returned a huge number!",
+                "tool_calls_success_level": 25,
+                "details": {},
+            }
         }
 
     score = 1 # Default score for "all bad"
@@ -60,12 +62,14 @@ async def flow_side_effect(timeout, **kwargs):
         score = 3 # Mixed good and bad
 
     return {
-        "chain_of_thought": f"Evaluated {total_calls} tool calls with {total_good_calls} correct calls.",
-        "tool_calls_success_level": score,
-        "additional_details": {
-            "tool_calls_made_by_agent": total_calls,
-            "correct_tool_calls_made_by_agent": total_good_calls,
-        },
+        "llm_output": {
+            "chain_of_thought": f"Evaluated {total_calls} tool calls with {total_good_calls} correct calls.",
+            "tool_calls_success_level": score,
+            "details": {
+                "tool_calls_made_by_agent": total_calls,
+                "correct_tool_calls_made_by_agent": total_good_calls,
+            },
+        }
     }
 
 
@@ -132,7 +136,7 @@ def test_evaluate_tools_valid1(self, mock_model_config):
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
         assert f"{key}_reason" in result
         assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 1 correct calls."
-        assert "details" in result
+        assert f"{key}_details" in result
 
     def test_evaluate_tools_valid2(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -194,7 +198,7 @@ def test_evaluate_tools_valid2(self, mock_model_config):
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
         assert f"{key}_reason" in result
         assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 0 correct calls."
-        assert "details" in result
+        assert f"{key}_details" in result
 
     def test_evaluate_tools_valid3(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -256,7 +260,7 @@ def test_evaluate_tools_valid3(self, mock_model_config):
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
         assert f"{key}_reason" in result
         assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 2 correct calls."
-        assert "details" in result
+        assert f"{key}_details" in result
 
     def test_evaluate_tools_one_eval_fails(self, mock_model_config):
         with pytest.raises(EvaluationException) as exc_info:
@@ -351,7 +355,7 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config):
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
         assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
-        assert result["details"] == {}
+        assert result[f"{key}_details"] == {}
 
     def test_evaluate_tools_all_not_applicable(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -391,7 +395,7 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config):
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
         assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
-        assert result["details"] == {}
+        assert result[f"{key}_details"] == {}
 
     def test_evaluate_tools_no_tools(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -424,7 +428,7 @@ def test_evaluate_tools_no_tools(self, mock_model_config):
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
         assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE
-        assert result["details"] == {}
+        assert result[f"{key}_details"] == {}
 
     def test_evaluate_bing_custom_search(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
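
For completeness, a tiny self-contained sketch (not part of the test file) of why the mocked return value had to gain the "llm_output" wrapper: the evaluator now unwraps the flow output before reading the score, so a mock that returns the parsed output directly would be read as an empty dict.

import asyncio

async def fake_flow(timeout, **kwargs):
    # Mimics the updated flow_side_effect: parsed LLM output nested under "llm_output".
    return {"llm_output": {"tool_calls_success_level": 4, "details": {}}}

async def main():
    prompty_output_dict = await fake_flow(timeout=600)
    # The unwrapping step the evaluator now performs (see the _tool_call_accuracy diff above):
    llm_output = prompty_output_dict.get("llm_output", {})
    assert llm_output.get("tool_calls_success_level") == 4

asyncio.run(main())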
