Tools Evaluators Fix: Check input tool_definitions for built-in tools to handle Agent v2 case (#44188)

m7md7sien · web-flow · commit c922f0a45e85 · 2025-12-22T19:25:49.000Z
* Tools Evaluators Fix: Check input tool_definitions for built-in tools to handle Agent v2 case

* Fix unit tests
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
@@ -287,8 +287,7 @@ def _extract_needed_tool_definitions(
                     elif tool_name:
                         # This is a regular function tool from converter
                         tool_definition_exists = any(
-                            tool.get("name") == tool_name and tool.get("type", "function") == "function"
-                            for tool in tool_definitions_expanded
+                            tool.get("name") == tool_name for tool in tool_definitions_expanded
                         )
                         if not tool_definition_exists:
                             raise EvaluationException(
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py
@@ -297,7 +297,7 @@ def test_evaluate_tools_one_eval_fails(self, mock_model_config):
 
         assert "Invalid score value" in str(exc_info.value)
 
-    def test_evaluate_tools_some_not_applicable(self, mock_model_config):
+    def test_evaluate_tools_some_missing_tool_definitions(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
         evaluator._flow = MagicMock(side_effect=flow_side_effect)
 
@@ -331,21 +331,7 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config):
                         }
                     },
                 },
-            },
-            {
-                "name": "buy_jacket",
-                "type": "another_built_in",  # This tool will be filtered out
-                "description": "Buy a jacket of the given type.",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "type": {
-                            "type": "string",
-                            "description": "The type of jacket to buy.",
-                        }
-                    },
-                },
-            },
+            },  # buy_jacket definition is missing
         ]
         result = evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions)
 
@@ -357,7 +343,7 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config):
         assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
         assert result[f"{key}_details"] == {}
 
-    def test_evaluate_tools_all_not_applicable(self, mock_model_config):
+    def test_evaluate_tools_built_in_tool_definition(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
         evaluator._flow = MagicMock(side_effect=flow_side_effect)
 
@@ -374,7 +360,7 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config):
         tool_definitions = [
             {
                 "name": "fetch_weather",
-                "type": "some_built_in",  # Not a 'function' type
+                "type": "some_built_in",  # Not a 'function' type but shouldn't be filtered out
                 "description": "Fetches the weather information for the specified location.",
                 "parameters": {
                     "type": "object",
@@ -391,11 +377,13 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config):
 
         key = ToolCallAccuracyEvaluator._RESULT_KEY
         assert result is not None
-        assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT
+        assert key in result and f"{key}_result" in result and f"{key}_threshold" in result
+        assert result[key] == 5.0  # All good gets score 5
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
-        assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
-        assert result[f"{key}_details"] == {}
+        assert f"{key}_reason" in result
+        assert result[f"{key}_reason"] == "Evaluated 1 tool calls with 1 correct calls."
+        assert f"{key}_details" in result
 
     def test_evaluate_tools_no_tools(self, mock_model_config):
         evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)