
Commit d532ddf

salma-elshafey and Salma Elshafey authored
Fix flow structure for Task Completion and Tool Call Accuracy evaluators (#43646)
* Fix Tool Call Accuracy after latest base eval updates
* Fix Task Completion after latest base eval changes
* Tool Call Accuracy Fix
* remove gpt_ key
* to re-trigger build pipeline
* pass threshold to eval initialization
* rename details key and add gpt_ prefix to tool call accuracy

Co-authored-by: Salma Elshafey <[email protected]>
1 parent ef340f3 · commit d532ddf

File tree

3 files changed: +46, -23 lines

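For orientation: the "flow structure" this commit fixes is the shape of what self._flow(...) returns. After the latest base-eval updates it returns a wrapper dictionary instead of the parsed LLM output itself. Below is a minimal sketch of that shape, inferred only from the .get() calls in the diffs that follow; the key names come from the diffs, the values are invented for illustration.

# Illustrative sketch only: the wrapper the evaluators below now assume the base
# prompty flow returns. Key names are taken from the .get() calls in the diffs;
# the values here are made up.
prompty_output_dict = {
    "llm_output": {"success": True, "details": ""},  # parsed LLM response, previously returned directly
    "input_token_count": 812,
    "output_token_count": 64,
    "total_token_count": 876,
    "finish_reason": "stop",
    "model_id": "gpt-4o",
    "sample_input": "...",
    "sample_output": "...",
}

# Each evaluator therefore unwraps it before reading scores:
llm_output = prompty_output_dict.get("llm_output", {})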

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py

Lines changed: 10 additions & 1 deletion
@@ -149,7 +149,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[bool, str]]: # ty
         if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None:
             eval_input["tool_definitions"] = reformat_tool_definitions(eval_input["tool_definitions"], logger)
 
-        llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = prompty_output_dict.get("llm_output", {})
+
         if isinstance(llm_output, dict):
             success = llm_output.get("success", False)
             if isinstance(success, str):
@@ -162,6 +164,13 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[bool, str]]: # ty
                 f"{self._result_key}_result": success_result,
                 f"{self._result_key}_reason": reason,
                 f"{self._result_key}_details": llm_output.get("details", ""),
+                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
+                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
+                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
+                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
             }
         if logger:
             logger.warning("LLM output is not a dictionary, returning False for the success.")
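
To make the new keys concrete, here is a hypothetical Task Completion result payload after this change, assuming the evaluator's result key is "task_completion". Only keys visible in the diff are shown, and the values are invented.

# Hypothetical example; not produced by running the evaluator.
task_completion_result_example = {
    "task_completion_result": "pass",
    "task_completion_reason": "The agent completed the user's request.",
    "task_completion_details": "",
    "task_completion_prompt_tokens": 812,
    "task_completion_completion_tokens": 64,
    "task_completion_total_tokens": 876,
    "task_completion_finish_reason": "stop",
    "task_completion_model": "gpt-4o",
    "task_completion_sample_input": "...",
    "task_completion_sample_output": "...",
}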

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py

Lines changed: 17 additions & 7 deletions
@@ -132,6 +132,7 @@ def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE,
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             credential=credential,
+            threshold=threshold,
             **kwargs,
         )
 
@@ -235,8 +236,8 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
         :rtype: Dict
         """
         # Single LLM call for all tool calls
-        llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = prompty_output_dict.get("llm_output", {})
         if isinstance(llm_output, dict):
             score = llm_output.get(self._LLM_SCORE_KEY, None)
             if not score or not check_score_is_valid(
@@ -257,10 +258,18 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
             score_result = "pass" if score >= self.threshold else "fail"
             response_dict = {
                 self._result_key: score,
+                f"gpt_{self._result_key}": score,
                 f"{self._result_key}_result": score_result,
-                f"{self._result_key}_threshold": self.threshold,
+                f"{self._result_key}_threshold": self._threshold,
                 f"{self._result_key}_reason": reason,
-                "details": llm_output.get("details", {}),
+                f"{self._result_key}_details": llm_output.get("details", {}),
+                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
+                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
+                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
+                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
             }
             return response_dict
 
@@ -275,9 +284,9 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
     async def _real_call(self, **kwargs):
         """The asynchronous call where real end-to-end evaluation logic is performed.
 
-        :keyword kwargs: The inputs to evaluate.
+        :keyword kwargs: The inputs to evaluate
         :type kwargs: Dict
-        :return: The evaluation result.
+        :return: The evaluation result
         :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
         """
         # Convert inputs into list of evaluable inputs.
@@ -300,10 +309,11 @@ def _not_applicable_result(self, error_message):
         # If no tool calls were made or tool call type is not supported, return not applicable result
         return {
             self._result_key: self._NOT_APPLICABLE_RESULT,
+            f"gpt_{self._result_key}": self._NOT_APPLICABLE_RESULT,
             f"{self._result_key}_result": "pass",
             f"{self._result_key}_threshold": self.threshold,
             f"{self._result_key}_reason": error_message,
-            "details": {},
+            f"{self._result_key}_details": {},
         }
 
     def _extract_needed_tool_definitions(self, tool_calls, tool_definitions):
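
Similarly, a hypothetical Tool Call Accuracy payload after this change, assuming the result key is "tool_call_accuracy". Note the new gpt_-prefixed alias and the details key now carrying the result-key prefix; values are invented.

# Hypothetical example; not produced by running the evaluator.
tool_call_accuracy_result_example = {
    "tool_call_accuracy": 4.0,
    "gpt_tool_call_accuracy": 4.0,  # new legacy-style alias added by this commit
    "tool_call_accuracy_result": "pass",
    "tool_call_accuracy_threshold": 3,
    "tool_call_accuracy_reason": "Evaluated 2 tool calls with 2 correct calls.",
    "tool_call_accuracy_details": {  # renamed from the bare "details" key
        "tool_calls_made_by_agent": 2,
        "correct_tool_calls_made_by_agent": 2,
    },
    # ...plus the same *_prompt_tokens, *_completion_tokens, *_total_tokens,
    # *_finish_reason, *_model, *_sample_input, and *_sample_output keys
    # added to Task Completion above.
}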

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py

Lines changed: 19 additions & 15 deletions
@@ -47,9 +47,11 @@ async def flow_side_effect(timeout, **kwargs):
     if invalid_calls > 0:
         # Return a non-numeric score to trigger an exception in the evaluator's check_score_is_valid
         return {
-            "chain_of_thought": "The tool calls were very correct that I returned a huge number!",
-            "tool_calls_success_level": 25,
-            "additional_details": {},
+            "llm_output": {
+                "chain_of_thought": "The tool calls were very correct that I returned a huge number!",
+                "tool_calls_success_level": 25,
+                "details": {},
+            }
         }
 
     score = 1 # Default score for "all bad"
@@ -60,12 +62,14 @@ async def flow_side_effect(timeout, **kwargs):
         score = 3 # Mixed good and bad
 
     return {
-        "chain_of_thought": f"Evaluated {total_calls} tool calls with {total_good_calls} correct calls.",
-        "tool_calls_success_level": score,
-        "additional_details": {
-            "tool_calls_made_by_agent": total_calls,
-            "correct_tool_calls_made_by_agent": total_good_calls,
-        },
+        "llm_output": {
+            "chain_of_thought": f"Evaluated {total_calls} tool calls with {total_good_calls} correct calls.",
+            "tool_calls_success_level": score,
+            "details": {
+                "tool_calls_made_by_agent": total_calls,
+                "correct_tool_calls_made_by_agent": total_good_calls,
+            },
+        }
     }
 
 
@@ -132,7 +136,7 @@ def test_evaluate_tools_valid1(self, mock_model_config):
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
         assert f"{key}_reason" in result
         assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 1 correct calls."
-        assert "details" in result
+        assert f"{key}_details" in result
 
     def test_evaluate_tools_valid2(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -194,7 +198,7 @@ def test_evaluate_tools_valid2(self, mock_model_config):
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
         assert f"{key}_reason" in result
         assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 0 correct calls."
-        assert "details" in result
+        assert f"{key}_details" in result
 
     def test_evaluate_tools_valid3(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -256,7 +260,7 @@ def test_evaluate_tools_valid3(self, mock_model_config):
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
         assert f"{key}_reason" in result
         assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 2 correct calls."
-        assert "details" in result
+        assert f"{key}_details" in result
 
     def test_evaluate_tools_one_eval_fails(self, mock_model_config):
         with pytest.raises(EvaluationException) as exc_info:
@@ -351,7 +355,7 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config):
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
         assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
-        assert result["details"] == {}
+        assert result[f"{key}_details"] == {}
 
     def test_evaluate_tools_all_not_applicable(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -391,7 +395,7 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config):
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
         assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
-        assert result["details"] == {}
+        assert result[f"{key}_details"] == {}
 
     def test_evaluate_tools_no_tools(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
@@ -424,7 +428,7 @@ def test_evaluate_tools_no_tools(self, mock_model_config):
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
         assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE
-        assert result["details"] == {}
+        assert result[f"{key}_details"] == {}
 
     def test_evaluate_bing_custom_search(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
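
For completeness, a tiny self-contained sketch (not part of the test file) of why the mocked return value had to gain the "llm_output" wrapper: the evaluator now unwraps the flow output before reading the score, so a mock that returns the parsed output directly would be read as an empty dict.

import asyncio

async def fake_flow(timeout, **kwargs):
    # Mimics the updated flow_side_effect: parsed LLM output nested under "llm_output".
    return {"llm_output": {"tool_calls_success_level": 4, "details": {}}}

async def main():
    prompty_output_dict = await fake_flow(timeout=600)
    # The unwrapping step the evaluator now performs (see the _tool_call_accuracy diff above):
    llm_output = prompty_output_dict.get("llm_output", {})
    assert llm_output.get("tool_calls_success_level") == 4

asyncio.run(main())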
